diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..04be9f36b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,23 @@ +.git +__pycache__ +*.py[oc] +.venv +.env +.envrc +.ruff_cache +.mypy_cache +.pytest_cache +.claude +.coverage +.DS_Store +build +dist +wheels +*.egg-info +docs +site +.github +.qdrant_code_embeddings +CLAUDE.md +AGENTS.md +PROJECT.md diff --git a/.env.example b/.env.example index dc518b501..72a64c672 100644 --- a/.env.example +++ b/.env.example @@ -60,6 +60,17 @@ # CYPHER_MODEL=gemini-2.5-flash # CYPHER_API_KEY=your-google-api-key +# Example 6: LiteLLM with custom provider +# ORCHESTRATOR_PROVIDER=litellm_proxy +# ORCHESTRATOR_MODEL=gpt-oss:120b +# ORCHESTRATOR_ENDPOINT=http://litellm:4000/v1 +# ORCHESTRATOR_API_KEY=sk-your-litellm-key + +# CYPHER_PROVIDER=litellm_proxy +# CYPHER_MODEL=openrouter/gpt-oss:120b +# CYPHER_ENDPOINT=http://litellm:4000/v1 +# CYPHER_API_KEY=sk-your-litellm-key + # Thinking budget for reasoning models (optional) # ORCHESTRATOR_THINKING_BUDGET=10000 # CYPHER_THINKING_BUDGET=5000 @@ -68,6 +79,12 @@ MEMGRAPH_HOST=localhost MEMGRAPH_PORT=7687 MEMGRAPH_HTTP_PORT=7444 +# Memgraph authentication credentials +# Leave MEMGRAPH_USERNAME empty (or omit it) if your Memgraph instance doesn't require authentication +# If authentication is enabled, provide both username and password +# Common defaults: username=neo4j, password=password (or your custom credentials) +MEMGRAPH_USERNAME= +MEMGRAPH_PASSWORD= LAB_PORT=3000 MEMGRAPH_BATCH_SIZE=1000 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000..49ff9c712 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @vitali87 diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index d5f29c336..163b5ae21 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -3,5 +3,4 @@ github: vitali87 buy_me_a_coffee: vitali87 -# Uncomment and add username when you set up Patreon: -# patreon: YOUR_USERNAME +patreon: vitali87 diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 4b6f8f59b..008667c7d 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,4 +1,4 @@ -blank_issues_enabled: false +blank_issues_enabled: true contact_links: - name: 💬 Discussions url: https://github.com/vitali87/code-graph-rag/discussions diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..a075b29ee --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,16 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + + - package-ecosystem: "docker" + directory: "/" + schedule: + interval: "weekly" + + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 000000000..8dc054f6c --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,38 @@ +## Summary + + + +- + +## Type of Change + + + +- [ ] Bug fix +- [ ] New feature +- [ ] Performance improvement +- [ ] Refactoring (no functional changes) +- [ ] Documentation +- [ ] CI/CD or tooling +- [ ] Dependencies + +## Related Issues + + + +## Test Plan + + + +- [ ] Unit tests pass (`make test-parallel` or `uv run pytest -n auto -m "not integration"`) +- [ ] New tests added +- [ ] Integration tests pass (`make test-integration`, requires Docker) +- [ ] Manual testing (describe below) + +## Checklist + +- [ ] PR title follows [Conventional 
Commits](https://www.conventionalcommits.org/) format +- [ ] All pre-commit checks pass (`make pre-commit`) +- [ ] No hardcoded strings in non-config/non-constants files +- [ ] No `# type: ignore`, `cast()`, `Any`, or `object` type hints +- [ ] No new comments or docstrings (code should be self-documenting) diff --git a/.github/workflows/build-binaries.yml b/.github/workflows/build-binaries.yml index c548d82ea..315cfa45a 100644 --- a/.github/workflows/build-binaries.yml +++ b/.github/workflows/build-binaries.yml @@ -8,10 +8,14 @@ on: release: types: [created] +permissions: read-all + jobs: build: name: Build ${{ matrix.platform }}-${{ matrix.arch }} runs-on: ${{ matrix.os }} + permissions: + contents: write timeout-minutes: 30 strategy: fail-fast: false @@ -32,18 +36,18 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 submodules: recursive - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.12" - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 with: enable-cache: true cache-dependency-glob: "uv.lock" @@ -66,7 +70,7 @@ jobs: fi - name: Upload binary artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: code-graph-rag-${{ matrix.platform }}-${{ matrix.arch }} path: dist/code-graph-rag-* @@ -75,7 +79,39 @@ jobs: - name: Upload to release if: startsWith(github.ref, 'refs/tags/v') - uses: softprops/action-gh-release@v2 + uses: softprops/action-gh-release@a06a81a03ee405af7f2048a818ed3f03bbf83c7b # v2 with: files: dist/code-graph-rag-* fail_on_unmatched_files: true + + sign-release: + name: Sign Release Artifacts + if: startsWith(github.ref, 'refs/tags/v') + needs: build + runs-on: ubuntu-latest + permissions: + contents: write + id-token: write + steps: + - name: Install cosign + uses: sigstore/cosign-installer@ba7bc0a3fef59531c69a25acd34668d6d3fe6f22 # v4.1.0 + + - name: Download all artifacts + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + with: + path: artifacts + merge-multiple: true + + - name: Sign artifacts + shell: bash + run: | + for f in artifacts/*; do + [ -f "$f" ] || continue + cosign sign-blob --yes --bundle "${f}.sigstore.json" "$f" + done + + - name: Upload signatures to release + uses: softprops/action-gh-release@a06a81a03ee405af7f2048a818ed3f03bbf83c7b # v2 + with: + files: artifacts/*.sigstore.json + fail_on_unmatched_files: false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 43b0cc8db..a7742b439 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,8 @@ on: branches: [main, master, develop] workflow_dispatch: +permissions: read-all + concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true @@ -19,16 +21,16 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 with: enable-cache: true cache-dependency-glob: "uv.lock" - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.12" @@ -51,16 +53,16 
@@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 with: enable-cache: true cache-dependency-glob: "uv.lock" - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.12" @@ -75,7 +77,7 @@ jobs: test-unit: name: Unit Tests (${{ matrix.os }}) runs-on: ${{ matrix.os }} - timeout-minutes: 15 + timeout-minutes: 20 strategy: fail-fast: false matrix: @@ -83,19 +85,19 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: recursive fetch-depth: 0 - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 with: enable-cache: true cache-dependency-glob: "uv.lock" - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.12" @@ -103,13 +105,19 @@ jobs: run: | uv sync --extra treesitter-full --extra test --extra semantic --group dev - - name: Run unit tests (parallel) + - name: Run unit tests (parallel, with coverage) + if: matrix.os == 'macos-latest' run: | uv run pytest -n auto -m "not integration" --tb=short --cov=codebase_rag --cov-report=xml --cov-report=term + - name: Run unit tests (parallel, no coverage) + if: matrix.os != 'macos-latest' + run: | + uv run pytest -n auto -m "not integration" --tb=short + - name: Upload coverage to Codecov - if: always() && secrets.CODECOV_TOKEN != '' - uses: codecov/codecov-action@v4 + if: always() && matrix.os == 'macos-latest' + uses: codecov/codecov-action@1af58845a975a7985b0beb0cbe6fbbb71a41dbad # v5.5.3 with: files: ./coverage.xml flags: unit-${{ matrix.os }} @@ -123,7 +131,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: recursive fetch-depth: 0 @@ -133,7 +141,7 @@ jobs: docker run -d --name memgraph -p 7687:7687 memgraph/memgraph-platform:latest echo "Waiting for Memgraph to start..." for i in {1..30}; do - if docker exec memgraph echo "SELECT 1;" 2>/dev/null; then + if docker exec memgraph mgconsole --no-history -c "RETURN 1;" 2>/dev/null; then echo "Memgraph is ready!" 
break fi @@ -142,13 +150,13 @@ jobs: done - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 with: enable-cache: true cache-dependency-glob: "uv.lock" - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.12" @@ -164,8 +172,8 @@ jobs: uv run pytest -m "integration" -v --tb=short --cov=codebase_rag --cov-report=xml --cov-report=term - name: Upload coverage to Codecov - if: always() && secrets.CODECOV_TOKEN != '' - uses: codecov/codecov-action@v4 + if: always() + uses: codecov/codecov-action@1af58845a975a7985b0beb0cbe6fbbb71a41dbad # v5.5.3 with: files: ./coverage.xml flags: integration-ubuntu-latest @@ -187,7 +195,7 @@ jobs: steps: - name: Check PR title format - uses: amannn/action-semantic-pull-request@v5 + uses: amannn/action-semantic-pull-request@48f256284bd46cdaab1048c3721360e808335d50 # v6.1.1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml index ecd3732f3..6c0c48ebf 100644 --- a/.github/workflows/claude-code-review.yml +++ b/.github/workflows/claude-code-review.yml @@ -10,6 +10,8 @@ on: - "*.py" - "pyproject.toml" +permissions: read-all + jobs: claude-review: name: AI Code Review @@ -26,13 +28,13 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 1 - name: Run Claude Code Review id: claude-review - uses: anthropics/claude-code-action@beta + uses: anthropics/claude-code-action@28f83620103c48a57093dcc2837eec89e036bb9f # beta with: claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml new file mode 100644 index 000000000..853e4df66 --- /dev/null +++ b/.github/workflows/docker-publish.yml @@ -0,0 +1,62 @@ +name: Docker Publish + +on: + push: + tags: + - 'v*' + workflow_dispatch: + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +permissions: read-all + +jobs: + build-and-push: + runs-on: ubuntu-latest + timeout-minutes: 60 + permissions: + contents: read + packages: write + attestations: write + id-token: write + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3 + + - uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3 + + - uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # v5 + id: meta + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=semver,pattern={{major}} + type=sha + + - uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7.0.0 + id: push + with: + context: . 
+ platforms: linux/amd64,linux/arm64 + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - uses: actions/attest-build-provenance@96b4a1ef7235a096b17240c259729fdd70c83d45 # v2 + with: + subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + subject-digest: ${{ steps.push.outputs.digest }} + push-to-registry: true diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 000000000..912c8eb02 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,58 @@ +name: Deploy Documentation + +on: + push: + branches: + - main + paths: + - "docs/**" + - "mkdocs.yml" + # (H) Rebuilds periodically so the GitHub repo widget (version, stars, forks) + # stays current; MkDocs Material fetches these stats at build time. + schedule: + - cron: "0 */6 * * *" + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: pages + cancel-in-progress: false + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 + with: + enable-cache: true + cache-dependency-glob: "uv.lock" + + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + + - name: Install dependencies + run: uv sync --group docs + + - name: Build site + run: uv run mkdocs build --strict + + - uses: actions/upload-pages-artifact@7b1f4a764d45c48632c6b24a0339c27f5614fb0b # v4.0.0 + with: + path: site + + deploy: + needs: build + runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - id: deployment + uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4 diff --git a/.github/workflows/label-sync.yml b/.github/workflows/label-sync.yml index ec787447e..40cc0e2c0 100644 --- a/.github/workflows/label-sync.yml +++ b/.github/workflows/label-sync.yml @@ -9,9 +9,10 @@ on: - ".github/workflows/label-sync.yml" workflow_dispatch: schedule: - # Run weekly on Mondays at 00:00 UTC to ensure labels stay in sync - cron: "0 0 * * 1" +permissions: read-all + jobs: sync-labels: name: Sync Repository Labels @@ -22,10 +23,10 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Sync labels - uses: micnncim/action-label-syncer@v1 + uses: micnncim/action-label-syncer@3abd5ab72fda571e69fffd97bd4e0033dd5f495c # v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/osv-scanner.yml b/.github/workflows/osv-scanner.yml new file mode 100644 index 000000000..5ac2a0a24 --- /dev/null +++ b/.github/workflows/osv-scanner.yml @@ -0,0 +1,50 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# A sample workflow which sets up periodic OSV-Scanner scanning for vulnerabilities, +# in addition to a PR check which fails if new vulnerabilities are introduced. 
+# +# For more examples and options, including how to ignore specific vulnerabilities, +# see https://google.github.io/osv-scanner/github-action/ + +name: OSV-Scanner + +on: + pull_request: + branches: [ "main" ] + merge_group: + branches: [ "main" ] + schedule: + - cron: '29 2 * * 4' + push: + branches: [ "main" ] + +permissions: read-all + +jobs: + scan-scheduled: + if: ${{ github.event_name == 'push' || github.event_name == 'schedule' }} + uses: google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@c5996e0193a3df57d695c1b8a1dec2a4c62e8730 # v2.3.3 + permissions: + actions: read + security-events: write + contents: read + with: + scan-args: |- + -r + --skip-git + ./ + scan-pr: + if: ${{ github.event_name == 'pull_request' || github.event_name == 'merge_group' }} + uses: google/osv-scanner-action/.github/workflows/osv-scanner-reusable-pr.yml@c5996e0193a3df57d695c1b8a1dec2a4c62e8730 # v2.3.3 + permissions: + actions: read + security-events: write + contents: read + with: + scan-args: |- + -r + --skip-git + ./ diff --git a/.github/workflows/poor-quality-management.yml b/.github/workflows/poor-quality-management.yml index df73ada89..657a86dae 100644 --- a/.github/workflows/poor-quality-management.yml +++ b/.github/workflows/poor-quality-management.yml @@ -4,9 +4,11 @@ on: pull_request_target: types: [labeled] schedule: - - cron: "0 9 * * *" # Daily at 9 AM UTC + - cron: "0 9 * * *" workflow_dispatch: +permissions: read-all + jobs: notify-poor-quality: name: Notify Poor Quality PR @@ -19,7 +21,7 @@ jobs: steps: - name: Add warning comment - uses: actions/github-script@v7 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | const message = `⚠️ **This PR has been marked as poor-quality.** @@ -73,7 +75,7 @@ jobs: steps: - name: Close PRs with poor-quality label older than 7 days - uses: actions/github-script@v7 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | const LABEL_NAME = 'poor-quality'; diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 000000000..1201a3a14 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,38 @@ +name: Publish to PyPI + +on: + release: + types: [published] + +permissions: read-all + +jobs: + publish: + name: Publish to PyPI + runs-on: ubuntu-latest + timeout-minutes: 10 + environment: pypi + permissions: + id-token: write + contents: read + + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Install uv + uses: astral-sh/setup-uv@e4db8464a088ece1b920f60402e813ea4de65b8f # v4 + with: + enable-cache: true + cache-dependency-glob: "uv.lock" + + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + + - name: Build package + run: uv build + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1 diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml new file mode 100644 index 000000000..08b117574 --- /dev/null +++ b/.github/workflows/scorecard.yml @@ -0,0 +1,78 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. 
See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + - cron: '32 23 * * 2' + push: + branches: [ "main" ] + +# Declare default permissions as read only. +permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + # `publish_results: true` only works when run from the default branch. conditional can be removed if disabled. + if: github.event.repository.default_branch == github.ref_name || github.event_name == 'pull_request' + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. + # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # (Optional) Uncomment file_mode if you have a .gitattributes with files marked export-ignore + # file_mode: git + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard (optional). 
+ # Commenting out will disable upload of results to your repo's Code Scanning dashboard + - name: "Upload to code-scanning" + uses: github/codeql-action/upload-sarif@38697555549f1db7851b81482ff19f1fa5c4fedc # v3 + with: + sarif_file: results.sarif diff --git a/.github/workflows/sonarcloud.yml b/.github/workflows/sonarcloud.yml new file mode 100644 index 000000000..123b16f0a --- /dev/null +++ b/.github/workflows/sonarcloud.yml @@ -0,0 +1,45 @@ +name: SonarCloud + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + sonarcloud: + name: SonarCloud Analysis + if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push' + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + + - name: Install uv + uses: astral-sh/setup-uv@38f3f104447c67c051c4a08e39b64a148898af3a # v4 + with: + enable-cache: true + cache-dependency-glob: "uv.lock" + + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + + - name: Install dependencies + run: uv sync --extra treesitter-full --extra test --extra semantic --group dev + + - name: Run tests with coverage + run: uv run pytest -n auto -m "not integration" --tb=short --cov=codebase_rag --cov-report=xml + + - name: SonarCloud Scan + uses: SonarSource/sonarqube-scan-action@fd88b7d7ccbaefd23d8f36f73b59db7a3d246602 # v6 + env: + SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} diff --git a/.github/workflows/split-score.yml b/.github/workflows/split-score.yml new file mode 100644 index 000000000..7c65ac2e2 --- /dev/null +++ b/.github/workflows/split-score.yml @@ -0,0 +1,22 @@ +name: PR Split Score + +on: + pull_request: + branches: [main] + +permissions: + contents: read + pull-requests: write + +jobs: + score: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: pr-split score + uses: vitali87/pr-split@v1.0.0 + with: + max-loc: "400" diff --git a/.github/workflows/version-bump.yml b/.github/workflows/version-bump.yml index 0940adcad..596a01ccd 100644 --- a/.github/workflows/version-bump.yml +++ b/.github/workflows/version-bump.yml @@ -16,6 +16,8 @@ on: - minor - major +permissions: read-all + jobs: bump-version: name: Auto Version Bump @@ -26,7 +28,7 @@ jobs: contents: write steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 2 token: ${{ secrets.GITHUB_TOKEN }} @@ -90,12 +92,17 @@ jobs: run: | sed -i 's/^version = ".*"/version = "${{ steps.bump_version.outputs.new }}"/' pyproject.toml + - name: Update server.json + if: steps.check_manual.outputs.skip == 'false' + run: | + sed -i 's/"version": "[^"]*"/"version": "${{ steps.bump_version.outputs.new }}"/g' server.json + - name: Commit version bump if: steps.check_manual.outputs.skip == 'false' run: | git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" - git add pyproject.toml + git add pyproject.toml server.json git commit -m "chore: bump version to ${{ steps.bump_version.outputs.new }}" git push diff --git a/.gitignore b/.gitignore index 4b6211856..aff67adaf 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ PROJECT.md .DS_Store .pypi_cache.json .omc +site/ diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml index 92a09727a..ec74ba6f1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,23 +5,24 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml + args: [--unsafe] - id: check-toml - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.12.2 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] - exclude: ^codec/schema_pb2\.(py|pyi)$ + exclude: ^(codec/schema_pb2\.(py|pyi)|benchmarks/|optimize/)$ - id: ruff-format - exclude: ^codec/schema_pb2\.(py|pyi)$ + exclude: ^(codec/schema_pb2\.(py|pyi)|benchmarks/|optimize/)$ - repo: local hooks: - id: ty name: ty check - entry: uv run ty check --exclude codebase_rag/tests/ + entry: uv run ty check --exclude codebase_rag/tests/ --exclude benchmarks/ --exclude optimize/ language: system types: [python] - exclude: ^codec/.*_pb2\.py$ + exclude: ^(codec/.*_pb2\.py|benchmarks/|optimize/)$ pass_filenames: false - repo: local hooks: @@ -30,7 +31,7 @@ repos: entry: uv run python scripts/check_no_docs.py language: system types: [python] - exclude: ^codec/schema_pb2\.py$ + exclude: ^(codec/schema_pb2\.py|benchmarks/|optimize/) - repo: local hooks: - id: generate-readme @@ -45,7 +46,7 @@ repos: - id: bandit args: ["-c", "pyproject.toml", "--severity-level", "high"] additional_dependencies: ["bandit[toml]"] - exclude: ^(codebase_rag/tests/|scripts/) + exclude: ^(codebase_rag/tests/|scripts/|benchmarks/|optimize/) - repo: https://github.com/compilerla/conventional-pre-commit rev: v4.2.0 hooks: diff --git a/BENCHMARK_REPORT.md b/BENCHMARK_REPORT.md new file mode 100644 index 000000000..d96875e01 --- /dev/null +++ b/BENCHMARK_REPORT.md @@ -0,0 +1,199 @@ +# Benchmark Report: Measured vs Projected Performance + +## Methodology + +All benchmarks ran on macOS (Darwin 25.3.0), Python 3.12, using `uv run`. Each benchmark used: +- 3 warmup runs (discarded) +- 20 to 100 measured iterations (depending on benchmark) +- Statistical measures: median, mean, stddev, min, max, p95 +- Realistic data sizes matching the profiled workload (352 files, ~4,500 registry entries) + +Benchmark scripts are in `benchmarks/`. Run all with `uv run python benchmarks/run_all.py`. + +--- + +## FINDING 1: `find_ending_with` Linear Scan (48.3% of CPU) + +**The single biggest performance win available, requiring zero dependencies.** + +The `FunctionRegistryTrie.find_ending_with()` method falls back to a linear scan of all entries when the `_simple_name_lookup` index misses (80.7% miss rate per profiling data). + +### Measured Results + +| Scenario | Registry Size | Queries | Linear Scan (ms) | Full Suffix Index (ms) | Speedup | +|---|---|---|---|---|---| +| Batch lookup | 1,000 | 38 | 1.77 | 0.007 | **261x** | +| Batch lookup | 4,500 | 38 | 8.04 | 0.023 | **356x** | +| Batch lookup | 10,000 | 38 | 17.78 | 0.046 | **382x** | +| Single lookup | 4,500 | 1 | 0.22 | 0.001 | **178x** | + +### Projected vs Measured + +The integration feasibility report projected ~1.9x total speedup (saving 13.5s of 31.2s). Our benchmarks show that building a complete suffix index provides **178x to 382x speedup** on the specific operation, validating the projection and suggesting the total improvement could be even larger than estimated. + +### Fix + +Build a complete suffix index in `FunctionRegistryTrie` by populating `_simple_name_lookup` for every insert, and ensure all insertion code paths (including `__setitem__`) update the index. This eliminates the linear scan fallback entirely. 
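A minimal sketch of that fix, assuming a dict-backed registry (the class name and `_simple_name_lookup` come from the report; the field layout and method signatures here are illustrative, not the project's actual code):

```python
from collections import defaultdict


class FunctionRegistryTrie:
    def __init__(self) -> None:
        self._entries: dict[str, str] = {}
        self._simple_name_lookup: defaultdict[str, set[str]] = defaultdict(set)

    def __setitem__(self, qualified_name: str, node_type: str) -> None:
        self._entries[qualified_name] = node_type
        # Index every insert by its simple (last-segment) name so that
        # find_ending_with never falls back to a linear scan.
        simple_name = qualified_name.rsplit(".", 1)[-1]
        self._simple_name_lookup[simple_name].add(qualified_name)

    def find_ending_with(self, simple_name: str) -> list[str]:
        # O(1) index hit instead of an O(n) scan over ~4,500 entries.
        return list(self._simple_name_lookup.get(simple_name, ()))
```

Because every insertion path populates the index, the 80.7% miss rate (and the linear fallback it triggers) disappears entirely.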
+ +--- + +## FINDING 2: pathlib vs String Operations (13.7% of CPU) + +**The `should_skip_path` function uses `pathlib.Path.relative_to()` which creates intermediate objects on every call.** + +### Measured Results + +| Operation | pathlib (ms) | String ops (ms) | Speedup | +|---|---|---|---| +| `relative_to` vs `removeprefix` (5,000 paths) | 61.3 | 0.097 | **634x** | +| `relative_to` vs `removeprefix` (20,000 paths) | 253.0 | 0.394 | **643x** | +| Full `should_skip_path` (5,000 paths) | 69.3 | 1.55 | **45x** | +| Full `should_skip_path` (20,000 paths) | 285.9 | 6.21 | **46x** | +| `Path.suffix` vs `str.rfind` (5,000 paths) | 6.97 | 0.278 | **25x** | +| `Path.name` vs `str.rfind+slice` (5,000 paths) | 6.37 | 0.360 | **18x** | + +### Projected vs Measured + +The integration report projected 4.0s savings (13.7% of 31.2s total). Our benchmarks show `pathlib.relative_to` is 634x slower than `str.removeprefix`, and the full `should_skip_path` function is 45x slower with pathlib. These numbers validate the projection: for 59,012 calls at ~57us/call (pathlib), the total is ~3.4s, matching the profiled 3.39s. + +### Fix + +Convert paths to strings at the boundary of `should_skip_path` and use `str.removeprefix()`, `str.split("/")`, and `set` membership testing instead of `Path.relative_to()` and `Path.parts`. + +--- + +## FINDING 3: orjson vs stdlib json (JSON Serialization) + +**orjson provides massive speedups on serialization with zero integration overhead.** + +### Measured Results + +| Operation | Data Size | json (ms) | orjson (ms) | Speedup | +|---|---|---|---|---| +| dumps compact | 372 KB | 1.16 | 0.21 | **5.5x** | +| dumps compact | 1.9 MB | 5.73 | 1.01 | **5.7x** | +| dumps compact | 8.5 MB | 26.6 | 4.91 | **5.4x** | +| dumps indented | 372 KB | 9.70 | 0.39 | **24.7x** | +| dumps indented | 1.9 MB | 48.5 | 2.02 | **24.0x** | +| dumps indented | 8.5 MB | 216.9 | 8.58 | **25.3x** | +| loads | 372 KB | 1.26 | 0.62 | **2.0x** | +| loads | 1.9 MB | 6.23 | 3.24 | **1.9x** | +| loads | 8.5 MB | 30.1 | 16.6 | **1.8x** | + +### Projected vs Measured + +The language recommendations projected 5x to 15x. Our measured results show: +- **Compact serialization: 5.4x to 5.7x** (within projected range) +- **Indented serialization: 24x to 25x** (exceeds projected range significantly) +- **Deserialization: 1.8x to 2.0x** (below projected range) + +The indented serialization speedup is particularly relevant because `_write_graph_json` uses `json.dump(data, f, indent=2)` (the slowest path). For a 20K node graph, this drops from 217ms to 8.6ms. + +--- + +## FINDING 4: BLAKE3 vs SHA256 Hashing (NEGATIVE RESULT) + +**BLAKE3 is slower than hashlib.sha256 for this workload. The recommendation is invalidated.** + +### Measured Results + +| Operation | SHA256 (ms) | BLAKE3 (ms) | Speedup | +|---|---|---|---| +| 500 snippet hashes | 0.155 | 0.325 | **0.5x (slower)** | +| 2,000 snippet hashes | 0.594 | 1.177 | **0.5x (slower)** | +| 10,000 snippet hashes | 2.988 | 6.131 | **0.5x (slower)** | +| 50 file hashes (5KB avg) | 0.968 | 1.031 | **0.9x (slower)** | +| 200 file hashes (10KB avg) | 4.419 | 4.964 | **0.9x (slower)** | +| 500 file hashes (20KB avg) | 14.164 | 15.883 | **0.9x (slower)** | + +### Analysis + +The language recommendations projected 4x to 10x speedup. Our benchmarks show BLAKE3 is actually **0.5x to 0.9x** (slower) for this workload. This is because: + +1. **hashlib.sha256 is already C-backed** (OpenSSL). The baseline is not pure Python. +2. 
**BLAKE3's SIMD advantages require large contiguous buffers.** Code snippets average 200 bytes; file chunks are 5-20KB. BLAKE3's parallelism does not engage at these sizes. 3. **FFI overhead dominates.** The `blake3` Python package adds per-call FFI overhead that exceeds the algorithmic savings for small inputs. + +**Verdict: Do not adopt BLAKE3.** The recommendation was based on algorithmic benchmarks, not Python binding benchmarks. + +--- + +## FINDING 5: FunctionRegistryTrie Baseline Performance + +### Measured Results (Existing Python Implementation) + +| Operation | 1K entries | 5K entries | 10K entries | 50K entries | |---|---|---|---|---| | insert (ms) | 0.33 | 1.76 | 3.74 | 18.1 | | lookup (ms) | 0.04 | 0.19 | 0.41 | 2.06 | | find_ending_with (ms) | 0.004 | 0.018 | 0.046 | 0.47 | | find_with_prefix (ms) | 0.39 | 2.18 | 4.18 | 39.9 | | delete 25% (ms) | 0.42 | 2.10 | 4.20 | 22.2 | + +### Analysis + +The trie operations are already fast when the index is hit (O(1) via `_simple_name_lookup`). The Rust trie rewrite (projected 3x to 8x) would save microseconds per operation. The integration feasibility report correctly identified that a standalone Rust trie provides only 1.5x to 3x net gain after FFI overhead. The **pure Python fix (Finding 1) provides 178x to 382x speedup** on the actual bottleneck, making the Rust rewrite unnecessary. + +--- + +## FINDING 6: GraphLoader JSON Parse + Index Build + +### Measured Results + +| Graph Size | JSON Parse Only (ms) | GraphLoader.load (ms) | Index Build Overhead | |---|---|---|---| | 1K nodes, 2K rels | 1.03 | 2.10 | 2.0x | | 5K nodes, 10K rels | 5.15 | 10.6 | 2.1x | | 20K nodes, 50K rels | 24.2 | 64.2 | 2.7x | + +### Analysis + +GraphLoader.load() is 2x to 2.7x slower than raw JSON parsing due to index construction (node-by-id, node-by-label, outgoing/incoming relationship indexes). With orjson, the JSON parse portion would drop from 24.2ms to ~13.4ms (1.8x), but index construction would remain unchanged. Net improvement for 20K nodes: 64.2ms to ~53ms (1.2x). The index construction is pure Python dict/list operations. + +--- + +## FINDING 7: File Hashing Comparison + +### Measured Results + +| Algorithm | 50 files (5KB) | 200 files (10KB) | 500 files (20KB) | |---|---|---|---| | SHA256 (8KB buffer) | 0.98ms | 4.43ms | 14.3ms | | SHA256 (64KB buffer) | 1.05ms | 4.61ms | 14.9ms | | SHA256 (mmap) | 1.30ms | 5.76ms | 17.4ms | | MD5 | 1.22ms | 6.44ms | 24.7ms | | BLAKE2b | 1.04ms | 5.17ms | 17.5ms | + +### Analysis + +SHA256 with 8KB buffer is already the fastest option. Larger buffers and mmap add overhead for these file sizes. MD5 is slower (no hardware acceleration on this platform). File hashing consumes <0.5% of total runtime. No optimization needed.
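To make the Finding 2 fix concrete before the summary, a minimal sketch of a string-based `should_skip_path` (the signature and the `SKIP_DIRS` set are assumptions for illustration; the real skip rules live in the indexing pipeline):

```python
SKIP_DIRS = {".git", "__pycache__", ".venv", "node_modules"}  # illustrative set


def should_skip_path(abs_path: str, root: str) -> bool:
    # Stay in string space: removeprefix + split avoid the intermediate
    # objects that Path.relative_to() and Path.parts allocate on every call.
    rel = abs_path.removeprefix(root).lstrip("/")
    return any(part in SKIP_DIRS for part in rel.split("/"))
```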
+ +--- + +## Summary: Validated vs Invalidated Recommendations + +| Recommendation | Language Report Projection | Measured Result | Verdict | +|---|---|---|---| +| Fix `find_ending_with` index | ~1.9x total speedup | **261x to 382x** on the operation | **VALIDATED (exceeds projection)** | +| Replace pathlib with strings | ~1.15x total speedup | **45x to 643x** on path ops | **VALIDATED (exceeds projection)** | +| orjson for JSON | 5x to 15x on JSON ops | **1.8x to 25x** depending on operation | **VALIDATED** | +| BLAKE3 for hashing | 4x to 10x speedup | **0.5x (slower)** | **INVALIDATED** | +| neo4j-rust-ext | 3x to 10x on DB ops | N/A (wrong driver) | **INVALIDATED** (uses Memgraph/pymgclient) | +| Rust AST extension | 10x to 16x on parsing | Not benchmarked (3.1% of CPU) | **DEPRIORITIZED** (targets 3.1% of runtime) | +| Rust trie | 3x to 8x on lookups | 1.5x to 3x net (per feasibility) | **SUPERSEDED** by Python index fix | + +## Revised Priority Order (Measured) + +| Priority | Fix | Type | Measured Speedup | Effort | +|---|---|---|---|---| +| **1** | Fix `find_ending_with` suffix index | Python bugfix | 261x to 382x on operation (~1.9x total) | Low | +| **2** | Replace pathlib with string ops | Python refactor | 45x to 643x on path ops (~1.15x total) | Low | +| **3** | Cache type inference results | Python memoization | Not benchmarked (projected ~1.07x total) | Low | +| **4** | Suppress debug logging | Config change | Not benchmarked (projected ~1.06x total) | Trivial | +| **5** | Deduplicate FS traversal | Python refactor | Not benchmarked (projected ~1.05x total) | Low | +| **6** | orjson for JSON | Dependency swap | 5.4x to 25x on JSON ops | Trivial | +| **7** | Rust AST extension | Rust crate | Targets 3.1% of CPU; ~1.03x total after Python fixes | High | + +**Combined estimated speedup from priorities 1 through 6: ~3.7x, with zero language rewrites.** + +The Rust AST extension (previously the headline recommendation at "10x to 16x") targets only 3.1% of actual CPU time and provides ~1.03x total improvement after the pure Python fixes are applied. It should only be considered for repositories significantly larger than the current benchmark workload. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..9b47f9561 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +eheva87@gmail.com. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. 
+ +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..e965de91d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,51 @@ +FROM ghcr.io/astral-sh/uv:0.10@sha256:72ab0aeb448090480ccabb99fb5f52b0dc3c71923bffb5e2e26517a1c27b7fec AS uv + +FROM python:3.14-slim@sha256:fb83750094b46fd6b8adaa80f66e2302ecbe45d513f6cece637a841e1025b4ca AS builder + +COPY --from=uv /uv /uvx /bin/ + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + cmake build-essential libssl-dev zlib1g-dev libzstd-dev && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY pyproject.toml uv.lock ./ +RUN uv sync --frozen --no-dev --extra treesitter-full --no-install-project --no-binary-package pymgclient + +COPY . . 
+RUN uv sync --frozen --no-dev --extra treesitter-full --no-binary-package pymgclient + +FROM python:3.14-slim@sha256:fb83750094b46fd6b8adaa80f66e2302ecbe45d513f6cece637a841e1025b4ca + +RUN apt-get update && \ + apt-get install -y --no-install-recommends ripgrep libssl3 zlib1g libzstd1 && \ + rm -rf /var/lib/apt/lists/* + +RUN useradd --create-home appuser +USER appuser +WORKDIR /app + +COPY --from=builder --chown=appuser:appuser /app/.venv /app/.venv +COPY --from=builder --chown=appuser:appuser /app/codebase_rag /app/codebase_rag +COPY --from=builder --chown=appuser:appuser /app/codec /app/codec +COPY --from=builder --chown=appuser:appuser /app/cgr /app/cgr +COPY --from=builder --chown=appuser:appuser /app/pyproject.toml /app/pyproject.toml + +ENV PATH="/app/.venv/bin:$PATH" + +COPY --chmod=755 <<'EOF' /app/entrypoint.sh +#!/bin/sh +ARCH=$(uname -m) +case "$ARCH" in + x86_64) LIBDIR="/lib/x86_64-linux-gnu" ;; + aarch64) LIBDIR="/lib/aarch64-linux-gnu" ;; + *) LIBDIR="/lib" ;; +esac +export LD_PRELOAD="$LIBDIR/libz.so.1:$LIBDIR/libzstd.so.1" +exec code-graph-rag "$@" +EOF + +ENTRYPOINT ["/app/entrypoint.sh"] +CMD ["mcp-server"] diff --git a/INTEGRATION_FEASIBILITY.md b/INTEGRATION_FEASIBILITY.md new file mode 100644 index 000000000..b65a9da31 --- /dev/null +++ b/INTEGRATION_FEASIBILITY.md @@ -0,0 +1,392 @@ +# Integration Feasibility Report + +## Build System and Deployment Context + +**Package manager:** `uv` (Astral), defined in `pyproject.toml` with `uv.lock` +**Build backend:** setuptools (via `[tool.setuptools]`), three packages: `codebase_rag`, `codec`, `cgr` +**Distribution:** PyPI wheel, Docker image (`python:3.12-slim`), PyInstaller binary +**CI/CD:** Pre-commit hooks (ruff, ty, bandit), Makefile targets +**Python version:** 3.12+ required +**Key native dependency:** `pymgclient` (compiled from source with `--no-binary-package`) + +--- + +## Candidate 1: orjson (Drop-in JSON Replacement) + +### Integration Strategy +Drop-in dependency swap. Replace `import json` with `import orjson` in graph_loader.py, graph_updater.py, services/graph_service.py, embedder.py, stdlib_extractor.py. + +### Integration Overhead +- **Serialization boundary:** Zero. orjson is a direct Python C extension. No FFI marshalling. +- **API difference:** `orjson.dumps()` returns `bytes` not `str`. Every `json.dumps()` call site that feeds the result to something expecting `str` needs `.decode()`. In this codebase, the `_write_graph_json` function in `main.py` uses `json.dump(graph_data, f, indent=2, ensure_ascii=False)` which would need adjustment since orjson's `OPT_INDENT_2` flag replaces the `indent` parameter. +- **Protobuf service:** `services/protobuf_service.py` does not use JSON. No impact. +- **Hash cache I/O:** `_save_hash_cache` and `_load_hash_cache` use `json.dump/load` with file objects. orjson does not support file-object streaming; need to call `orjson.dumps()` then `f.write()`. +- **Embedding cache:** Same pattern. `EmbeddingCache.save()` uses `json.dump(self._cache, f)`. Requires manual write of bytes. +- **Build system change:** Add `orjson>=3.10.0` to `[project.dependencies]`. orjson publishes pre-built wheels for all platforms. No toolchain change. +- **Docker impact:** Zero. orjson wheels are self-contained. +- **PyInstaller impact:** Add `--hidden-import orjson`. orjson is a single .so/.pyd file, minimal size increase. + +### Net Projected Gain +- **Raw gain:** 5x to 15x on JSON operations +- **Integration overhead:** Near zero. 
~10 call sites need minor API adjustments (bytes vs str, file.write vs json.dump). +- **Net gain:** 5x to 15x on JSON operations. No overhead erosion. +- **Risk:** Very low. Widely adopted library (polars, FastAPI, etc.) + +--- + +## Candidate 2: neo4j-rust-ext (NOT APPLICABLE) + +### Integration Strategy +NOT APPLICABLE. This codebase uses **Memgraph** via `pymgclient` (mgclient C library), NOT the Neo4j Python driver. The `neo4j-rust-ext` package patches the `neo4j` Python driver's PackStream implementation. It has zero effect on `pymgclient`. + +### Assessment +- `services/graph_service.py` imports `mgclient`, connects to Memgraph, and uses the mgclient C API directly. +- There is no `neo4j` dependency in `pyproject.toml`. +- The language researcher's recommendation was based on an incorrect assumption about the database driver. + +### Alternative for Memgraph Driver +- pymgclient is already a C extension wrapping Memgraph's C client library. It is already compiled code. +- The actual overhead is in Python-side batch construction (building `list[RelBatchRow]` and `list[NodeBatchRow]` dicts), Cypher query string formatting, and result deserialization in `_cursor_to_results`. +- The `_cursor_to_results` method iterates cursor results and builds `list[ResultRow]` via `dict(zip(column_names, row))`. This is pure Python overhead. +- Potential optimization: Use cursor iteration in C rather than Python, but this requires pymgclient changes, not neo4j-rust-ext. + +### Net Projected Gain +- **Net gain:** 0x. This recommendation is inapplicable. + +--- + +## Candidate 3: BLAKE3 (Embedding Cache Hashing) + +### Integration Strategy +Drop-in hash function replacement in `EmbeddingCache._content_hash()` and `_hash_file()` in `graph_updater.py`. + +### Integration Overhead +- **Serialization boundary:** Zero. blake3 Python package is a C extension. +- **API change:** `hashlib.sha256(content.encode()).hexdigest()` becomes `blake3.blake3(content.encode()).hexdigest()`. One-line change per call site. +- **Cache invalidation:** Existing embedding caches (`.qdrant_code_embeddings/embedding_cache.json`) and file hash caches (`.file_hashes.json`) will be invalidated because hash values change. This forces a full re-index on first run after the change. +- **Build system change:** Add `blake3>=1.0.0` to dependencies. blake3 publishes pre-built wheels. +- **Docker/PyInstaller:** Minimal impact. blake3 is a small native extension. + +### Net Projected Gain +- **Raw gain:** 4x to 10x on hashing operations +- **Practical impact:** Hashing is NOT the bottleneck. `_hash_file` reads 8KB chunks and hashes them. For a typical codebase (1000 files, avg 5KB), total hashing takes ~5ms (already fast because hashlib SHA256 is C-backed). The real I/O cost is the filesystem reads, not the hash computation. +- **Embedding cache hashing:** Similarly marginal. `_content_hash` hashes short code snippets. Each call takes microseconds. +- **Cache invalidation cost:** Forces a full re-indexing pass (potentially minutes for large repos), creating a one-time negative impact that dwarfs the per-operation savings. +- **Net gain:** Negligible in practice. The 4x to 10x improvement applies to an operation that takes microseconds per call. +- **Recommendation:** Skip unless profiling proves hashing is >5% of total wall clock time. 
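Tying back to Candidate 1, a sketch of the two adjustment patterns orjson requires at the call sites named above (function names mirror the report's `_write_graph_json` and `_load_hash_cache`; exact signatures are assumptions):

```python
import orjson


def write_graph_json(graph_data: dict, path: str) -> None:
    # orjson.dumps returns bytes and takes no indent= parameter:
    # OPT_INDENT_2 replaces json.dump(..., indent=2), and output is
    # always UTF-8, matching ensure_ascii=False.
    with open(path, "wb") as f:
        f.write(orjson.dumps(graph_data, option=orjson.OPT_INDENT_2))


def load_hash_cache(path: str) -> dict:
    # orjson has no file-object streaming API: read the bytes, then parse.
    with open(path, "rb") as f:
        return orjson.loads(f.read())
```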
+ +--- + +## Candidate 4: Rust AST Processing Extension (via PyO3/maturin) + +### Integration Strategy +Build a Rust extension crate (e.g., `codebase-rag-core`) that accepts file bytes + language enum and returns structured extraction results. Use PyO3 for Python bindings and maturin for building. + +### Integration Overhead Assessment + +**Data crossing the FFI boundary:** +- **Input:** File bytes (`bytes`) and language enum (`str`). Minimal copy cost. PyO3 provides zero-copy access to Python bytes via `&[u8]`. +- **Output:** The Rust extension must return complex structured data to Python: + - Function definitions: list of (qualified_name, name, start_line, end_line, decorators, docstring) + - Class definitions: list of (qualified_name, name, parent_classes, methods) + - Call relationships: list of (caller_qn, callee_qn, caller_type, callee_type) + - Import mappings: dict of (module_qn -> dict of (local_name -> imported_qn)) + + Each of these requires constructing Python objects from Rust data. For a file with 50 functions and 200 call sites, this means ~250 Python dict/tuple creations on the return path. + +**Boundary crossing cost estimate:** +- PyO3 object creation: ~100ns per Python object (dict, str, list element) +- For a typical large file (50 functions, 100 calls, 20 imports): ~170 result objects * 5 fields each = ~850 Python object creations = ~85 microseconds +- Per-file processing time in Python currently: ~5-50ms (depends on file size) +- **FFI boundary cost as fraction of saved time: <1%**. This is excellent. + +**Coupling analysis:** + +The Rust extension needs to replicate or subsume: +1. `definition_processor.py` (7.5KB): Function/class/method extraction from AST +2. `call_processor.py` (13.7KB): Call relationship extraction +3. `call_resolver.py` (24.4KB): Call resolution with trie lookups, inheritance chains, import maps +4. `import_processor.py` (40KB): Language-specific import parsing (Python, JS/TS, Java, Rust, Go, C++, Lua) +5. `function_ingest.py` (16.4KB): Function registration and qualified name resolution +6. `type_inference.py` (5.8KB) + language-specific engines: Type inference for call resolution +7. `FunctionRegistryTrie` in `graph_updater.py`: Trie data structure + +Total: ~110KB of Python code with complex multi-language logic spanning 8+ languages. + +**Build system changes:** +- Add `maturin` as build dependency +- Add a `Cargo.toml` at project root or in a subdirectory (e.g., `rust/`) +- Add `tree-sitter` and language grammar crates as Rust dependencies +- Modify `pyproject.toml` to include maturin build configuration or create a separate wheel +- CI needs Rust toolchain (rustup) installed +- Docker builder stage needs Rust toolchain (~300MB image layer increase) +- PyInstaller needs to collect the compiled .so/.pyd from the Rust extension + +**Compatibility concerns:** +- Tree-sitter versions must match between Rust and Python. The codebase uses `tree-sitter==0.25.2`. The Rust `tree-sitter` crate version must be compatible. +- The Rust extension must handle all 9 supported languages with language-specific AST patterns. +- The `IngestorProtocol` interface (ensure_node_batch, ensure_relationship_batch) is called from within the processing loop. Either the Rust extension calls back into Python (expensive, defeats the purpose) OR the Rust extension accumulates all results and returns them in bulk (preferred). 
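A rough Python-side view of the preferred bulk-return pattern (the `codebase_rag_core` module and its result fields are hypothetical; only the `IngestorProtocol` method names come from the report):

```python
from pathlib import Path

import codebase_rag_core  # hypothetical PyO3 extension module


def process_file(path: Path, language: str, ingestor) -> None:
    source = path.read_bytes()
    # One FFI crossing per file: bytes + language in, all extraction
    # results out, so the Rust side never calls back into Python mid-parse.
    result = codebase_rag_core.extract_file(source, language)
    for fn in result.functions:
        ingestor.ensure_node_batch("Function", fn)
    for call in result.calls:
        ingestor.ensure_relationship_batch("CALLS", call)
```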
+ +**Critical: tree-sitter Node FFI constraint (from adversarial review):** +- Tree-sitter `Node` objects are C-level pointers that cannot be marshalled across FFI boundaries. The call resolution pipeline operates on `Node` objects thousands of times per file. +- This rules out an incremental approach (e.g., rewriting just CallResolver in Rust while keeping Python tree-sitter nodes). The Rust extension must parse files from scratch using the `tree-sitter` Rust crate directly, producing Rust-native `Node` references. +- Consequence: the Rust extension is an all-or-nothing replacement of the entire parse-extract-resolve pipeline. Incremental migration is not feasible. This increases both effort and risk. + +**Deployment complexity:** +- Requires publishing platform-specific wheels (linux-x86_64, linux-aarch64, macos-x86_64, macos-arm64, windows-x64) +- maturin handles this via GitHub Actions + `maturin[zig]` for cross-compilation +- Users without pre-built wheels need a Rust toolchain to install from source +- The Docker image build becomes significantly more complex (multi-stage with Rust) + +### Net Projected Gain +- **Raw gain:** 10x to 16x on AST processing (the primary CPU hotspot) +- **FFI boundary overhead:** <1% (excellent input/output ratio: bytes in, structured results out) +- **Build system overhead:** Significant one-time cost. Ongoing CI cost of ~2-3 min for Rust compilation per release. +- **Development effort:** High. ~110KB of Python code to rewrite in Rust, with complex multi-language pattern matching. +- **Net gain:** 9x to 15x on AST processing operations, assuming bulk return pattern. +- **Risk:** Medium-high. Large surface area, 8+ language parsers, tight coupling with existing Python data structures. +- **Recommendation:** High value but should be incremental. Start with a single language (Python parser) as proof of concept, measure actual gains, then expand. + +--- + +## Candidate 5: Rust FunctionRegistryTrie (via PyO3) + +### Integration Strategy +Expose a Rust-backed trie as a Python class via PyO3, bundled in the same crate as Candidate 4. + +### Integration Overhead Assessment + +**Data crossing the FFI boundary:** +- **Insert:** Python str -> Rust &str (zero-copy via PyO3), Rust stores owned copy. Cost: one string allocation per insert. +- **Lookup (`__contains__`, `get`):** Python str -> Rust &str (zero-copy), returns bool or Python str. Cost: near zero per lookup. +- **Batch operations (`find_ending_with`, `find_with_prefix`):** Returns list of Python strings. For a query returning 50 matches, this means 50 Python string allocations. + +**Boundary crossing cost estimate:** +- Single lookup: ~50ns (vs ~200ns in Python dict) +- `find_ending_with` returning 10 results: ~1us (vs ~50us scanning Python dict) +- The trie has hot-path usage in `call_resolver.py` where every call expression triggers 2-5 trie lookups. + +**Coupling with Candidate 4:** +- If AST processing moves to Rust (Candidate 4), the trie must also be in Rust to avoid crossing back to Python for every lookup during call resolution. +- If Candidate 4 is NOT done, the Rust trie is still useful standalone, but the benefit is reduced because the Python call resolution code still creates Python strings for every lookup key. + +**Build system changes:** +- Bundled with Candidate 4. No additional build complexity. + +### Net Projected Gain +- **Raw gain:** 3x to 8x on trie operations +- **Standalone net gain (without Candidate 4):** 1.5x to 3x. 
Python call resolution code still creates string objects for lookup keys. FFI crossing happens per-lookup.
+- **Combined net gain (with Candidate 4):** 3x to 8x. All trie operations happen in Rust with no FFI boundary during resolution.
+- **Recommendation:** Only implement together with Candidate 4. Standalone, the integration overhead cuts the gains roughly in half.
+
+---
+
+## Candidate 6: File Processing Parallelism (Python)
+
+### Integration Strategy
+Use `concurrent.futures.ProcessPoolExecutor` to parallelize per-file processing in `GraphUpdater._process_files()`.
+
+### Integration Overhead Assessment
+
+**Serialization at boundary:**
+- Each worker process needs: file path (Path, serializable), language queries (NOT serializable: contains tree-sitter Parser, Query, Language objects which are C pointers).
+- **Critical problem:** `LanguageQueries` contains `Parser`, `Query`, and `Language` objects from tree-sitter, which are C-level objects that cannot be serialized across process boundaries.
+- Each worker would need to call `load_parsers()` independently, loading all language grammars (~50ms startup cost per worker).
+- Results (function definitions, call relationships) are Python dicts/tuples that serialize easily.
+
+**State synchronization:**
+- `FunctionRegistryTrie` is shared mutable state. Workers write to it during function registration, and readers need it during call resolution.
+- With multiprocessing, each worker would have its own trie. Merging tries after parallel processing adds complexity.
+- `import_mapping` in `ImportProcessor` is similarly shared mutable state.
+- The three-pass architecture (structure -> definitions -> calls) has inherent sequential dependencies: pass 3 needs results from pass 2.
+
+**GIL considerations:**
+- `threading.Thread` would not help because call resolution is CPU-bound Python code held by the GIL.
+- `ProcessPoolExecutor` bypasses the GIL but introduces serialization overhead.
+- Estimated per-file serialization overhead for results: ~0.1ms per file.
+- For 1000 files: ~100ms of total serialization overhead (~25ms wall-clock on 4 cores) vs ~5000ms saved.
+
+### Net Projected Gain
+- **Raw gain:** 2x to 4x (limited by sequential passes and Amdahl's law)
+- **Serialization overhead:** ~100ms for 1000 files (minimal against ~5s saved)
+- **Worker initialization overhead:** ~50ms per worker (grammar loading), amortized across files
+- **Architecture complexity:** High. Requires restructuring the three-pass processing pipeline, managing shared state (trie, import maps), and handling errors across processes.
+- **Net gain:** 1.5x to 3x after accounting for sequential bottlenecks (pass dependencies)
+- **Recommendation:** Medium priority. Worth doing after Candidate 4 (Rust extension) is evaluated. If Candidate 4 makes per-file processing fast enough, parallelism becomes less critical.
+
+---
+
+## Candidate 7: String Processing in Call Resolution (Rust)
+
+### Integration Strategy
+Bundled with Candidate 4. Call resolution logic moves into the Rust AST processing extension.
+
+### Integration Overhead
+- **Standalone:** NOT recommended. Call resolution is deeply interleaved with trie lookups, import map lookups, and AST node access. Extracting just the string processing would require marshalling all context (import maps, trie state, class inheritance) across FFI on every call.
+- **Bundled with Candidate 4:** Zero additional FFI overhead. The Rust extension performs call resolution as part of the same processing pass.
+
+### Net Projected Gain
+- **Standalone net gain:** Negative.
The overhead of passing import maps and trie state across FFI for each call resolution would exceed the savings from faster string processing.
+- **Bundled net gain:** 5x to 10x (absorbed into Candidate 4's gains)
+- **Recommendation:** Only implement as part of Candidate 4.
+
+---
+
+## Summary: Feasibility Verdicts
+
+| Candidate | Strategy | FFI Overhead | Build Impact | Net Gain | Verdict |
+|---|---|---|---|---|---|
+| 1. orjson | Dependency swap | None | Trivial | 5x-15x on JSON | **PROCEED** |
+| 2. neo4j-rust-ext | N/A | N/A | N/A | 0x (wrong driver) | **REJECT** |
+| 3. BLAKE3 hashing | Dependency swap | None | Trivial | Negligible | **SKIP** (not a bottleneck) |
+| 4. Rust AST extension | PyO3/maturin crate | <1% | Significant | 9x-15x on AST | **PROCEED** (incremental) |
+| 5. Rust trie | PyO3 (bundled #4) | ~50% standalone | Bundled with #4 | 1.5x-3x standalone, 3x-8x bundled | **BUNDLE with #4** |
+| 6. File parallelism | ProcessPoolExecutor | ~100ms/1000 files | Moderate refactor | 1.5x-3x | **DEFER** (after #4) |
+| 7. String processing | Rust (bundled #4) | Negative standalone | Bundled with #4 | Negative standalone, 5x-10x bundled | **BUNDLE with #4** |
+
+## Key Finding: Integration Overhead Negation Analysis
+
+The critical insight is that **Candidates 5 and 7 have negative net gains if implemented standalone** because the FFI boundary crossing cost exceeds the per-operation savings. They are only viable when bundled with Candidate 4, which keeps all related operations on the Rust side of the boundary.
+
+This validates the principle: **a function that runs 10x faster but whose boundary-inclusive cost is 8x its accelerated compute yields only a 10/8 = 1.25x net improvement.** For Candidates 5 and 7, the standalone case is even worse because the boundary must be crossed per-lookup (thousands of times per file) rather than per-file.
+
+**Candidate 2 is completely inapplicable** due to an incorrect driver assumption.
+
+**Candidate 3 optimizes a non-bottleneck** (microsecond-level operations).
+
+The only candidates with clear positive ROI after accounting for integration overhead are:
+1. **orjson** (zero overhead, significant JSON gains)
+2. **Rust AST extension** (minimal overhead due to bytes-in/results-out architecture, massive CPU gains)
+
+---
+
+## ADDENDUM: Revised Analysis Based on CPU Profiling Data
+
+The CPU profiling report (cProfile, 31.2s total, 179M function calls on 352 Python files) **dramatically changes the priority landscape.** The actual hotspots are fundamentally different from those assumed in the language recommendations.
+
+### Profiling Reality vs.
Language Researcher Assumptions + +| Rank | Actual Hotspot | % CPU | Language Researcher Assumption | +|------|---------------|-------|-------------------------------| +| 1 | `find_ending_with` linear scan | 48.3% | Assumed trie was working; recommended Rust trie for data layout improvement | +| 2 | `should_skip_path` pathlib overhead | 13.7% | Not identified as a hotspot | +| 3 | `build_local_variable_type_map` (uncached AST retraversal) | 8.3% | Assumed this was part of general AST processing | +| 4 | Loguru debug logging overhead | 5.9% | Not identified | +| 5 | `identify_structure` (duplicate FS traversal) | 5.0% | Not identified | +| 6 | tree-sitter `QueryCursor.captures` | 2.5% | Assumed this was the primary bottleneck (10x-16x claim) | +| 7 | tree-sitter `Parser.parse` | 0.6% | Assumed this was the primary bottleneck | + +**Tree-sitter operations total 3.1% of CPU time.** The language researcher's Hotspot 1 ("AST Parsing and Traversal, 10x-16x via Rust") targeted an operation that consumes only 3.1% of runtime. A 16x speedup on 3.1% of runtime yields 1.03x total speedup (Amdahl's law). The projected 10x-16x headline number is misleading. + +### Revised Candidate Assessments + +#### NEW CANDIDATE A: Fix `find_ending_with` Linear Scan (Pure Python Fix) + +**Integration strategy:** Pure Python algorithmic fix. No FFI, no new dependencies. + +**Root cause:** `_simple_name_lookup` index has an 80.7% miss rate (22,096 of 27,376 calls). On miss, the code falls back to `[qn for qn in self._entries.keys() if qn.endswith(f".{suffix}")]`, scanning all ~4,500 entries per call. This generates 123.7M `str.endswith()` invocations. + +**Fix options:** +1. **Populate `_simple_name_lookup` more aggressively:** The index only contains entries added via `FunctionRegistryTrie.insert()` which populates `self._simple_name_lookup` via the passed-in reference. The 80.7% miss rate suggests many qualified names are inserted through code paths that bypass the simple name index population. Audit all insertion paths. +2. **Build a suffix index:** Create a `dict[str, set[QualifiedName]]` mapping the last dot-separated segment of every qualified name to its full name. This converts O(n) scans to O(1) lookups. +3. **Cache negative results:** If a suffix has been scanned and yielded no results, cache that fact to avoid re-scanning. + +**Integration overhead:** Zero. This is a bugfix/optimization within existing Python code. +**Projected gain:** Eliminating 15.07s (48.3% of total) would reduce total runtime from 31.2s to ~16.1s. Even a 90% reduction (fixing most misses) saves ~13.5s. +**Net gain:** ~1.9x total speedup from a pure Python fix. +**Risk:** Very low. + +#### NEW CANDIDATE B: Replace pathlib with String Operations in `should_skip_path` + +**Integration strategy:** Pure Python refactor. Replace `Path.relative_to()` (3.39s across 59,012 calls) with `str.removeprefix()` or `os.path.relpath()`. + +**Root cause:** `pathlib.PurePosixPath.relative_to()` creates intermediate path objects on every call. For 59,012 calls, this creates ~118,000 intermediate objects. + +**Fix:** Convert paths to strings at the boundary and use `str.startswith()` / `str.removeprefix()` for prefix checks. The `should_skip_path` function only needs string comparison operations. + +**Integration overhead:** Zero. Internal refactor. +**Projected gain:** 4.29s (13.7%) reduced to ~0.2s (estimated 20x faster for string ops vs pathlib). Saves ~4s. +**Net gain:** ~1.15x total speedup. +**Risk:** Very low. 
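+
+A minimal sketch of Candidate B's string-based replacement, assuming a simplified `should_skip_path` signature and an illustrative `IGNORED_PARTS` set (the real skip rules live in the codebase's configuration):
+
+```python
+import os
+
+IGNORED_PARTS = {".git", "__pycache__", "node_modules", "site"}
+
+
+def should_skip_path(path: str, repo_root: str) -> bool:
+    # Pure string work: no intermediate PurePosixPath objects are created,
+    # unlike Path.relative_to(), which allocates ~2 objects per call.
+    rel = path.removeprefix(repo_root.rstrip(os.sep) + os.sep)
+    return any(part in IGNORED_PARTS for part in rel.split(os.sep))
+```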
+
+#### NEW CANDIDATE C: Cache `build_local_variable_type_map` Results
+
+**Integration strategy:** Memoize results keyed by (file_path, function_start_line, function_end_line); a sketch appears below, before the revised priority order.
+
+**Root cause:** Called 5,228 times, re-traversing AST nodes that have already been parsed. Multiple functions in the same file trigger independent traversals.
+
+**Integration overhead:** Memory cost of caching ~5,000 dict results. Estimated ~2MB.
+**Projected gain:** 2.59s (8.3%) reduced to ~0.5s (first traversal per function cached, subsequent hits free). Saves ~2s.
+**Net gain:** ~1.07x total speedup.
+**Risk:** Low. Need to ensure the cache is invalidated when files change (already handled by the incremental update system).
+
+#### NEW CANDIDATE D: Suppress Debug Logging in Production
+
+**Integration strategy:** Set loguru level to INFO or WARNING during graph building, or use lazy evaluation for debug messages.
+
+**Root cause:** 85,099 `debug()` calls processed (1.75s) even when debug output is not displayed.
+
+**Fix options:**
+1. Guard debug calls behind an explicit verbosity flag (loguru exposes no cheap `isEnabledFor` equivalent).
+2. Use `logger.opt(lazy=True).debug("... {}", lambda: expensive_value())` so arguments are evaluated only when the message is actually emitted.
+3. Set the log level to INFO at the start of `GraphUpdater.run()`.
+
+**Integration overhead:** Zero.
+**Projected gain:** 1.84s (5.9%) reduced to ~0.1s. Saves ~1.7s.
+**Net gain:** ~1.06x total speedup.
+**Risk:** Very low. Debug output is not needed during normal operation.
+
+#### NEW CANDIDATE E: Deduplicate Filesystem Traversal
+
+**Integration strategy:** `identify_structure()` and `_collect_eligible_files()` both call `rglob("*")` + `should_skip_path()`. Merge into a single traversal pass.
+
+**Integration overhead:** Moderate refactor of the two-pass architecture.
+**Projected gain:** 1.57s (5.0%) eliminated for the duplicate pass. If combined with Candidate B (string paths), the single remaining pass also runs ~20x faster.
+**Net gain:** ~1.05x total speedup.
+**Risk:** Low.
+
+### Combined Impact of Pure Python Fixes (Candidates A through E)
+
+| Fix | Time Saved | % of Total |
+|-----|-----------|------------|
+| A: Fix find_ending_with | ~13.5s | 43.3% |
+| B: String paths | ~4.0s | 12.8% |
+| C: Cache type inference | ~2.0s | 6.4% |
+| D: Suppress debug logging | ~1.7s | 5.5% |
+| E: Deduplicate FS traversal | ~1.5s | 4.8% |
+| **Total saved** | **~22.7s** | **72.8%** |
+| **Remaining runtime** | **~8.5s** | **27.2%** |
+
+**Combined speedup: ~3.7x from pure Python fixes alone, with zero integration overhead, zero build system changes, and zero deployment complexity.**
+
+After these fixes, the remaining 8.5s would be:
+- tree-sitter operations: ~1.0s (now 11.8% of reduced total)
+- Remaining call resolution: ~2.5s
+- File I/O + hashing: ~0.5s
+- Graph construction: ~2.5s
+- Miscellaneous: ~2.0s
+
+### Revised Candidate 4 (Rust AST Extension) Assessment
+
+After pure Python fixes, tree-sitter operations are 1.0s out of 8.5s (11.8%). A 16x Rust speedup on tree-sitter would save 0.94s, reducing total runtime from 8.5s to 7.6s (1.12x improvement). **This is far below the break-even threshold** given the high development cost (~110KB of Python code to port) and build system complexity.
+
+The Rust AST extension only becomes worthwhile AFTER all pure Python fixes are applied AND the workload scales to much larger codebases (10,000+ files) where tree-sitter operations become a larger fraction of the reduced total.
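+
+The Candidate C memoization referenced above, as a minimal sketch; the wrapper name `cached_type_map` and the `build_fn` indirection are illustrative, and the cache key follows the (file_path, start_line, end_line) scheme described there:
+
+```python
+from pathlib import Path
+from typing import Callable
+
+_type_map_cache: dict[tuple[str, int, int], dict[str, str]] = {}
+
+
+def cached_type_map(
+    file_path: Path, func_node, build_fn: Callable
+) -> dict[str, str]:
+    # tree-sitter nodes expose (row, column) positions via start_point/end_point.
+    key = (str(file_path), func_node.start_point[0], func_node.end_point[0])
+    if key not in _type_map_cache:
+        _type_map_cache[key] = build_fn(func_node)
+    return _type_map_cache[key]
+```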
+ +### Revised Priority Order + +| Priority | Candidate | Type | Net Gain (on 31.2s total) | Effort | Integration Overhead | +|----------|-----------|------|---------------------------|--------|---------------------| +| **1** | **A: Fix find_ending_with** | **Python bugfix** | **~1.9x (13.5s saved)** | **Low** | **Zero** | +| **2** | **B: String path ops** | **Python refactor** | **~1.15x (4.0s saved)** | **Low** | **Zero** | +| **3** | **C: Cache type inference** | **Python memoization** | **~1.07x (2.0s saved)** | **Low** | **Zero** | +| **4** | **D: Suppress debug logging** | **Config change** | **~1.06x (1.7s saved)** | **Trivial** | **Zero** | +| **5** | **E: Deduplicate FS traversal** | **Python refactor** | **~1.05x (1.5s saved)** | **Low** | **Zero** | +| 6 | 1: orjson | Dependency swap | Marginal on indexing | Trivial | Zero | +| 7 | 4+5+7: Rust AST extension | Rust crate | 1.12x after Python fixes | High | Significant | +| 8 | 6: File parallelism | Architecture change | 1.5x-3x after Python fixes | Moderate | Moderate | + +### Conclusion + +**The top 5 optimizations require zero language rewrites and zero integration overhead.** They fix algorithmic inefficiencies (linear scan), unnecessary object creation (pathlib), redundant computation (uncached type inference, duplicate traversal), and avoidable overhead (debug logging). Together they provide ~3.7x speedup. + +The Rust AST extension (previously the headline recommendation) addresses only 3.1% of actual CPU time and is demoted to priority 7. It should only be reconsidered after Python-level fixes are applied and the workload scales to repositories an order of magnitude larger than the current test case. diff --git a/LANGUAGE_RECOMMENDATIONS.md b/LANGUAGE_RECOMMENDATIONS.md new file mode 100644 index 000000000..fb2cd7d24 --- /dev/null +++ b/LANGUAGE_RECOMMENDATIONS.md @@ -0,0 +1,423 @@ +# Language Recommendations for Performance Hotspots + +## Executive Summary + +**CPU profiling reveals that 48.3% of total runtime is spent in a single Python function** (`FunctionRegistryTrie.find_ending_with()`) performing a linear scan fallback with 123.7M `str.endswith()` calls. This is a pure algorithmic bottleneck, not a language limitation, and fixing the simple name lookup index (80.7% miss rate) would nearly halve total runtime with zero language rewrite. + +After addressing algorithmic issues (Phase 0: ~3.7x total improvement from pure Python fixes), **Rust via PyO3** is the recommended target language for the remaining CPU-bound hotspots (AST wrapper overhead, trie operations, call resolution). For serialization, **orjson** (Rust-backed) is a drop-in replacement for stdlib json. ~~neo4j-rust-ext~~ was retracted (codebase uses Memgraph/pymgclient, not Neo4j). + +**Critical distinction:** This report contains both theoretical per-instruction overhead multipliers (20x-50x from structural analysis) and empirical runtime impact (from CPU profiling). The structural multipliers explain WHY Python is slow at specific operations, but the IMPACT must be measured against the actual profiled runtime distribution via Amdahl's law. After Phase 0 Python fixes reduce the baseline from 31.2s to ~8-10s, the Rust extension (Phase 2) addresses ~20% of the reduced baseline, yielding diminishing but still meaningful returns. + +**Profiling baseline:** 31.2 seconds (cProfile), 14.0s (wall-clock), 179M function calls for indexing 352 Python files. 
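+
+The Amdahl's-law arithmetic invoked throughout this report reduces to a one-line helper; a minimal sketch, using fractions quoted in this document:
+
+```python
+def amdahl(fraction: float, speedup: float) -> float:
+    """Total speedup when `fraction` of runtime is accelerated by `speedup`x."""
+    return 1.0 / ((1.0 - fraction) + fraction / speedup)
+
+
+# 16x Rust speedup on the 3.1% spent in tree-sitter C calls:
+print(f"{amdahl(0.031, 16):.2f}x")    # ~1.03x
+# Effectively eliminating the 48.3% spent in find_ending_with():
+print(f"{amdahl(0.483, 1000):.2f}x")  # ~1.93x
+```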
+ +--- + +## Hotspot Categories and Recommendations + +### HOTSPOT 1: Tree-sitter AST Parsing and Traversal + +**Files:** `parsers/call_processor.py`, `parsers/call_resolver.py`, `parsers/definition_processor.py`, `parsers/function_ingest.py`, `parsers/structure_processor.py`, all `parsers/handlers/*.py` + +**Workload:** Per-file tree-sitter parsing, QueryCursor iteration, recursive Node traversal, text extraction/decoding from AST nodes. Every file in a repository triggers full AST parsing and multi-pass traversal for functions, classes, calls, and imports. + +**Recommended Language:** Rust (via PyO3/maturin) + +**Projected Speedup:** 20x to 50x (revised upward based on structural analysis) + +**CPU PROFILING DATA:** +- `TypeInferenceEngine.build_local_variable_type_map()`: **2.59s cumulative (8.3%)** across 5,228 calls. Traverses ASTs that have already been parsed, with no caching of results across calls within the same file. +- `QueryCursor.captures()`: **0.78s self time (2.5%)** across 11,028 calls. Already a C extension, largely irreducible. +- `Parser.parse()`: **0.19s self time (0.6%)** across 352 calls. Already C, already fast. +- **Key insight from profiling:** Tree-sitter C operations (parse + captures) total only ~1.0s (3.1% of runtime). The overwhelming majority of AST-related CPU time is in the Python wrapper code doing traversal, type inference, and call resolution around these fast C operations. This validates the Rust rewrite approach: keep tree-sitter's C parsing (fast), move the Python traversal/processing into Rust. +- Loguru debug logging: **1.84s cumulative (5.9%)** across 91,119 calls, including 85,099 debug-level calls processed even when not displayed. This is a Python-level fix (reduce log level or guard debug calls). + +**Evidence:** +- Gauge.sh case study: Moving AST-dependent operations into a Rust extension yielded a 16x speedup (8.7s to 530ms) on a 500k-line codebase. The original Python implementation made ~60M malloc calls and spent 35% of cycles on GC; the Rust version made ~7M malloc calls with no significant GC activity. [Source: gauge.sh/blog/python-extensions-should-be-lazy] +- Tree-sitter is already written in C/Rust. The Python bindings add per-node FFI overhead on every `.child_by_field_name()`, `.text`, and `.children` access. Moving traversal logic into Rust eliminates this boundary-crossing cost entirely. +- ast-grep (Rust-based tree-sitter tool) demonstrates that keeping AST processing in Rust-land and only returning final results to Python is the optimal architecture. [Source: github.com/ast-grep/ast-grep] +- **Structural analysis (CRITICAL severity):** Static analysis confirmed 20x to 50x overhead multiplier per node visit. Every `.parent`, `.children`, `.type` access on tree-sitter nodes goes through Python's descriptor protocol (~50 instructions vs ~1 instruction for a direct struct field read in Rust/C). 
Specific hot patterns identified: + - `_build_nested_qualified_name()` in `function_ingest.py:344-389`: walks parent chain upward + - `_resolve_inherited_method()` in `call_resolver.py:624-649`: BFS through class_inheritance dict + - `is_method_node()` in `parsers/utils.py:159-173`: walks parent chain for every function node + - `_collect_ancestor_path_parts()` in `function_ingest.py:369-389`: ancestor walk with repeated type checks + - `_is_nested_inside_function()` in `class_ingest/mixin.py:34-45`: another parent chain walk +- **Additional structural overhead:** `bytes.decode("utf-8")` on every `node.text` access (MEDIUM severity, 3x to 5x overhead). The LRU cache at `parsers/utils.py:48-50` mitigates this partially, but `call_processor.py:49` bypasses the cache entirely. In Rust, zero-copy `&[u8]` slices eliminate this entirely. + +**Architecture:** Build a Rust extension that accepts file bytes and a language enum, performs tree-sitter parsing and all traversal passes (function extraction, class extraction, call extraction, import extraction) in Rust, and returns structured results (lists of function definitions, call relationships, class hierarchies) as Python objects. + +**GIL consideration (from concurrency analysis):** Tree-sitter's C extension already releases the GIL during parsing, which enables ThreadPoolExecutor parallelism for the current Python implementation. Any Rust rewrite MUST preserve this property by using `Python::allow_threads` in PyO3 during parsing and traversal, enabling concurrent file processing across threads without process-level parallelism overhead. + +**Why not Cython:** Cython cannot eliminate the Python-to-C FFI overhead of tree-sitter node access, since the bottleneck is the per-node boundary crossing, not Python loop overhead. Rust allows direct tree-sitter C API access without Python object creation. + +**Why not Go:** Go's FFI to C (cgo) has higher overhead than Rust's native C interop. Go's garbage collector would reintroduce the GC pauses that are a key problem in the Python implementation. PyO3 is a more mature Python interop story than Go's limited options (gopy, cgo+ctypes). + +--- + +### HOTSPOT 2: FunctionRegistryTrie Operations + +**Files:** `graph_updater.py` (FunctionRegistryTrie class), `parsers/call_resolver.py` + +**Workload:** Trie insertion and lookup for qualified function names. Every function/method/class definition triggers a trie insert (string splitting on `.`, nested dict traversal). Every call resolution triggers trie lookups, often with multiple fallback strategies (direct lookup, inheritance chain walking, simple name fallback). + +**Recommended Language:** Rust (via PyO3/maturin) + +**Projected Speedup:** 10x to 50x on the post-fix baseline (NOT on the current 15s runtime) + +**IMPORTANT CONTEXT (from integration-architect):** The 10x-50x speedup applies to trie operations AFTER the algorithmic index fix (Priority 0a). After fixing the `_simple_name_lookup` 80.7% miss rate, trie operations drop from 15s to under 1s in pure Python. The Rust trie's 10x-50x improvement then applies to an operation taking <1s, yielding <1s additional savings. The algorithmic fix alone yields ~2x on total runtime. The Rust rewrite is justified by (a) GIL release enabling thread parallelism and (b) cumulative savings across all trie/string operations, but the root cause is an algorithmic bug, not a language limitation. 
+ +**CPU PROFILING DATA (the #1 finding):** +- `find_ending_with()` at `graph_updater.py:156`: **7.91s self time (25.3%), 15.07s cumulative (48.3%)** across 27,376 calls +- Root cause: The `_simple_name_lookup` index has an **80.7% miss rate** (22,096 of 27,376 calls miss). On each miss, the code falls back to a linear scan: `[qn for qn in self._entries.keys() if qn.endswith(f".{suffix}")]`, triggering **123.7M `str.endswith()` calls** (7.21s self time) +- Called 26,950 times from `CallResolver._try_resolve_via_trie()`, the last-resort call resolution strategy +- **This single function accounts for nearly half of all CPU time. The trie data structure exists but is bypassed in favor of the linear fallback in most cases.** +- **CRITICAL: Fix the simple name lookup index first (Python algorithmic fix).** A proper reverse index mapping simple names to qualified names would eliminate the linear scan entirely, reducing this from 15.07s to sub-second. This is the highest-ROI optimization in the entire codebase. Note: even after the algorithmic fix, Python's per-call `str.endswith()` overhead is 5x to 10x what Rust byte-slice comparisons would cost (structural analysis cross-reference), so the Rust trie rewrite remains valuable for the remaining lookup operations. + +**Evidence for language rewrite (after algorithmic fix):** +- **Concurrency analysis confirms this is GIL-bound:** Pure Python trie/dict operations in `FunctionRegistryTrie` and `CallResolver` hold the GIL throughout, preventing any thread-level parallelism. The concurrency analyst estimates 10x to 50x speedup from moving this to native code. This is the strongest case for a Rust rewrite since it eliminates both per-operation overhead AND the GIL bottleneck. +- The current implementation uses nested Python dicts as trie nodes, which means every level of trie traversal creates Python string objects and performs dict hash lookups with full Python object overhead. +- **Structural analysis (HIGH severity):** Python dicts carry 50 to 80 bytes overhead per entry plus hash computation. Each `in` or `[]` lookup involves: hash the key string (O(n) for string length), probe the hash table, compare keys. In Rust, a `HashMap` has similar algorithmic complexity but with inline storage, no reference counting, and cache-friendly memory layout. Specialized data structures (arena-allocated tries, interned string IDs) are practical in systems languages but impractical in Python due to the object model. +- **String overhead (HIGH severity, 5x to 15x):** Qualified names are constructed, split, compared, and looked up thousands of times per file. Each `.split(".")` allocates a new list of new string objects. Each f-string creates a new heap allocation. `_calculate_import_distance()` at `call_resolver.py:651-671` splits both strings and compares elementwise. In Rust, these would be zero-copy string views or stack-allocated slices. +- Rust trie implementations (radix_trie crate) store data contiguously in memory with no per-node heap allocation, eliminating GC pressure. For high-miss-rate lookups (common in call resolution with fallback chains), optimized Rust tries outperform Python dicts. [Source: dev.to/timclicks/two-trie-implementations-in-rust] +- The Gauge.sh case study showed that moving data structures out of Python and into compact Rust structs reduced malloc calls by 8.5x, directly relevant to this trie-heavy workload. +- PyO3 achieves 92% of pure Rust performance for data structure operations while maintaining full Python interoperability. 
[Source: pyo3.rs/main/performance]
+
+**Architecture:** First, fix the `_simple_name_lookup` index to cover the 80.7% miss cases (Python fix). Then, implement `FunctionRegistryTrie` as a Rust struct exposed via PyO3. The `insert()`, `get()`, and `find_ending_with()` methods accept Python strings, perform all trie operations in Rust, and return results. The `__contains__` check (used heavily in call resolution) stays in Rust. Use Rust's `lasso` or `string-interner` crate for interned string IDs to eliminate the qualified name duplication across the trie, `_entries`, `simple_name_lookup`, and `import_mapping` (memory profiling shows 3.5 MiB for 10k entries in Python vs ~400 KiB estimated in Rust with interning, a 9x reduction).
+
+**Convergence point (CPU + memory):** This is the strongest single rewrite target in the codebase. FunctionRegistryTrie is simultaneously the #1 CPU hotspot (48.3%) AND carries 9x memory overhead. A Rust replacement addresses both dimensions in one component.
+
+**Why not Cython:** Cython would help with loop overhead but cannot change the fundamental data layout. The bottleneck is Python dict overhead per trie node, which requires a different data structure (Rust's contiguous memory layout).
+
+---
+
+### HOTSPOT 3: JSON Serialization/Deserialization for Graph Data
+
+**Files:** `graph_loader.py`, `graph_updater.py`, `services/graph_service.py`
+
+**Workload:** Loading and saving large graph JSON files (nodes, relationships, properties). The `GraphLoader.load()` method reads potentially multi-megabyte JSON files. The `GraphUpdater` serializes graph data for Memgraph ingestion.
+
+**Recommended Language:** Drop-in replacement with orjson (Rust-backed)
+
+**Projected Speedup:** 5x to 15x
+
+**Evidence:**
+- orjson (written in Rust) is 2x to 15.8x faster than Python's stdlib json, depending on payload size. For large payloads (>1MB), gains are 10x or more. [Source: medium.com/codeelevation/want-500-faster-json-in-python-try-orjson]
+- orjson uses SIMD (AVX2) for parallel UTF-8 validation and string escaping, scanning 32 bytes at once vs byte-by-byte. [Source: github.com/ijl/orjson]
+- Memory usage is 75% lower peak RSS, which matters for large graph files.
+- For a 10K-record benchmark, orjson achieved 820 MB/s serialization vs json's 52 MB/s (15.8x).
+
+**Architecture:** Replace `import json` with `import orjson` throughout the codebase. This is the lowest-effort, highest-ROI optimization. orjson is a drop-in replacement for most use cases. The only API difference is that `orjson.dumps()` returns bytes instead of str.
+
+**Why this over a full rewrite:** The JSON parsing itself is the bottleneck, not the surrounding Python code. orjson already provides native Rust performance for this specific operation. Writing a custom Rust extension for JSON handling would duplicate orjson's work.
+
+---
+
+### ~~HOTSPOT 4: Neo4j Driver Communication~~ RETRACTED
+
+**CORRECTION (from integration-architect):** This codebase uses **Memgraph via `pymgclient`** (a C extension), NOT the Neo4j Python driver. There is no `neo4j` dependency in `pyproject.toml`. The `neo4j-rust-ext` package patches the Neo4j driver's PackStream implementation and has **zero effect** on `pymgclient`. This recommendation is retracted.
+
+`pymgclient` is already a C extension with low overhead. CPU profiling confirms database serialization (protobuf) is negligible at 0.17s total. No language rewrite is needed for the database communication layer.
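+
+For the Hotspot 3 swap above, the only caller-visible change is the bytes return type; a minimal sketch with illustrative data:
+
+```python
+import orjson
+
+graph_data = {"nodes": [{"qualified_name": "pkg.mod.func"}], "relationships": []}
+
+# orjson.dumps() returns bytes, so open files in binary mode.
+with open("graph.json", "wb") as f:
+    f.write(orjson.dumps(graph_data))
+
+with open("graph.json", "rb") as f:
+    graph_data = orjson.loads(f.read())
+```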
+ +--- + +### HOTSPOT 5: Embedding Cache Hashing + +**Files:** `embedder.py` (EmbeddingCache class) + +**Workload:** SHA256 hashing of code snippets for cache key generation. Each snippet is hashed via `hashlib.sha256(content.encode()).hexdigest()`. For large codebases, thousands of snippets are hashed. + +**Recommended Language:** Conditional: BLAKE3 (Rust-backed) if profiling confirms hashing as bottleneck + +**Projected Speedup:** 4x to 10x (for hashing only) + +**Evidence:** +- Python's hashlib SHA256 is already implemented in C (OpenSSL), so it's reasonably fast. Rust SHA256 achieves roughly 1.5x over Python's hashlib. [Source: users.rust-lang.org/t/hash-digest-performance-rust-vs-python/89686] +- If hashing is confirmed as a bottleneck, switching to BLAKE3 (via the `blake3` Python package, which is Rust-backed) provides 4x to 10x speedup over SHA256 because BLAKE3 is inherently faster and uses SIMD parallelism. [Source: devtoolspro.org/articles/sha256-alternatives-faster-hash-functions-2025/] +- The `blake3` Python package is a drop-in hash function replacement. API change is minimal: `blake3.blake3(content.encode()).hexdigest()`. + +**Architecture:** Replace `hashlib.sha256` with `blake3.blake3` in the `EmbeddingCache._content_hash()` method. This is a one-line change. Note: existing caches would need to be regenerated since hash values will differ. + +**CPU PROFILING RESULT: Hashing is NOT a bottleneck.** `_hash_file()` costs only 0.04s total (0.1%) across 453 calls. SHA-256 hashing is fast and not worth optimizing. BLAKE3 swap is deprioritized. + +**Additional structural insight (MEDIUM severity):** The embedding pipeline at `embedder.py:109-126` and `unixcoder.py:97-107` crosses the Python/C boundary 3+ times per embedding: Python `list[list[int]]` to `torch.tensor` (copy), through PyTorch C++ backend (efficient), `.cpu().numpy()` (copy), `.tolist()` (N allocations for N-dim vector). Each crossing involves full memory copies and new container allocations. In Rust with `tch-rs`, tensor references can be held throughout without conversion overhead, providing 2x to 3x improvement on the embedding data path itself (separate from model inference time). + +--- + +### HOTSPOT 6: File Traversal and Processing Pipeline + +**Files:** `parsers/structure_processor.py`, `graph_updater.py` (file walking, `should_skip_path`) + +**Workload:** Walking repository directories, reading files, determining language, applying gitignore/skip rules, and feeding files into the parser pipeline. + +**Recommended Language:** Python (with concurrency improvements) + +**Projected Speedup:** 3x to 5x (via pathlib fix + deduplication, not language rewrite) + +**CPU PROFILING DATA:** +- `should_skip_path()`: **4.29s cumulative (13.7%)** across 59,270 calls. Dominated by `pathlib.relative_to()` at 3.18s across 54,519 calls, which creates intermediate `PurePosixPath` objects internally. +- `_collect_eligible_files()`: **4.71s cumulative (15.1%)** from a single call. The `rglob` itself costs only ~0.4s, but `should_skip_path` per file dominates. +- `identify_structure()`: **1.57s cumulative (5.0%)** from a single call. Performs a **duplicate** `rglob("*")` pass with separate `should_skip_path()` calls. +- **Key insight from profiling:** File traversal is NOT I/O-bound as originally assumed. The bottleneck is Python pathlib object overhead (creating intermediate Path objects for every `relative_to()` call), not filesystem I/O (`posix.scandir` costs only 0.42s). 
Using string-based path operations instead of pathlib would eliminate most of this overhead. Additionally, merging the duplicate traversal passes would cut FS stat calls in half. + +**I/O PROFILING DATA (confirms NOT I/O-bound):** +- Actual disk I/O for the entire workload totals only **0.85s (6.1% of 14.0s)**. File reads: 0.02s, hashing: 0.02s, protobuf serialization: 0.01s, JSON cache: 0.001s. +- `pathlib.relative_to()` performs **zero disk I/O**. It constructs intermediate `PurePosixPath` objects via `__init__`, `is_relative_to`, `with_segments`, `_from_parsed_parts`. Measured at **10.6 us/call**. +- **String slice equivalent: 0.065 us/call (163x faster).** This is the measured speedup from the I/O profiler for replacing `pathlib.relative_to()` with string slicing. +- Duplicate `rglob("*")` traversals cost ~0.80s combined (two passes of ~0.40s each scanning 59,283 entries). + +**Evidence:** +- The `rglob` filesystem traversal itself is fast (0.42s). The 4.29s in `should_skip_path` is pure Python object creation overhead from pathlib. +- The real opportunity is (a) replacing `pathlib.relative_to()` with string slicing (163x faster per call), and (b) merging the two separate `rglob` passes into one. + +**Architecture:** Keep file traversal in Python. Fix pathlib overhead first (Priority 0b). Thread-based parallelism for file processing is less impactful than originally estimated: CPU profiling shows tree-sitter parsing is only 0.6% of total CPU, so parallelizing parsing yields minimal gains. The dominant bottleneck (48.3%) is in the post-parsing call resolution phase, which is sequential and GIL-bound. + +**Why not Rust for traversal:** The per-file processing calls into tree-sitter (C library) and constructs Python objects. The overhead is in path manipulation (pathlib), not traversal I/O. A string-based path fix in Python is sufficient. + +**Revised concurrency estimate (from concurrency analysis):** Original 3x-6x estimate for parallel file parsing revised downward since tree-sitter parsing is only 0.6% of CPU. Parallelism gains are secondary to algorithmic and native extension improvements. + +**Note (from concurrency analysis):** The Memgraph/Neo4j flush layer already uses ThreadPoolExecutor with separate connections, so the I/O layer is well structured and does not need a language rewrite. + +--- + +### HOTSPOT 7: String Processing in Call Resolution + +**Files:** `parsers/call_resolver.py`, `parsers/import_processor.py` + +**Workload:** Regex matching (`_SEPARATOR_PATTERN`, `_CHAINED_METHOD_PATTERN`), string splitting, qualified name construction (f-string concatenation), dict lookups in import maps. + +**Recommended Language:** Rust (bundled with Hotspot 1 and 2 rewrites) + +**Projected Speedup:** 5x to 20x (as part of the combined AST processing extension) + +**Evidence:** +- Rust string processing is 10x to 80x faster than Python for CPU-intensive operations. [Source: blog.jetbrains.com/rust/2025/11/10/rust-vs-python-finding-the-right-balance] +- The call resolution logic is tightly coupled to AST traversal (it runs during the call processing pass). Moving it into the same Rust extension as Hotspot 1 eliminates all Python object creation overhead for intermediate strings. +- The regex patterns used are simple (separator splitting, method chaining detection) and would be even faster using Rust's `regex` crate, which uses finite automata rather than Python's backtracking regex engine. 
+- **Structural analysis: Interpreter loop overhead (HIGH severity, 5x to 20x).** The innermost loops at `call_processor.py:285-328`, `import_processor.py:164-172`, and `graph_updater.py:405-434` execute ~20 to 30 Python bytecode instructions per iteration just for control flow (dynamic dispatch, isinstance checks with MRO traversal, reference count updates), before the actual work in called methods. A compiled language would inline these calls and eliminate dispatch overhead entirely. + +**Architecture:** Include call resolution logic in the Hotspot 1 Rust extension. The Rust code performs AST traversal, call name extraction, and call resolution in a single pass, returning only the final resolved call relationships to Python. + +--- + +## CPU Profiling Summary (from cProfile) + +**Workload:** `GraphUpdater.run(force=True)` indexing 352 Python files, 31.2s total, 179M function calls. + +| Rank | Function | Self Time | Cum. Time | % Total | Calls | Root Cause | +|---|---|---|---|---|---|---| +| 1 | `find_ending_with` | 7.91s | 15.07s | 48.3% | 27,376 | Linear scan fallback, 123.7M `endswith` calls | +| 2 | `should_skip_path` | 0.07s | 4.29s | 13.7% | 59,270 | Pathlib `relative_to` overhead (3.18s) | +| 3 | `build_local_variable_type_map` | 0.004s | 2.59s | 8.3% | 5,228 | Repeated AST traversal, no caching | +| 4 | Loguru logging | 0.41s | 1.84s | 5.9% | 91,119 | Debug-level overhead at high call volume | +| 5 | `identify_structure` | 0.02s | 1.57s | 5.0% | 1 | Duplicate FS traversal + should_skip_path | +| 6 | `QueryCursor.captures` | 0.78s | 0.78s | 2.5% | 11,028 | C extension, largely irreducible | +| 7 | `Parser.parse` | 0.19s | 0.19s | 0.6% | 352 | C extension, already fast | +| 8 | `_hash_file` | 0.001s | 0.04s | 0.1% | 453 | Negligible | + +**Key observations:** +1. 48.3% of CPU is in a single function with an algorithmic fix available (index miss rate) +2. Tree-sitter C operations (parse + captures) total only 1.0s (3.1%), confirming the bottleneck is Python wrapper code +3. Protobuf serialization is negligible (0.17s total) +4. File hashing is negligible (0.04s total) + +--- + +## Structural Performance Ceilings (from Static Analysis) + +The static-pattern-analyst identified 9 categories of Python runtime overhead that create inherent performance ceilings. These are organized by severity: + +| Severity | Pattern | Overhead Multiplier | Rewrite Benefit | +|---|---|---|---| +| CRITICAL | AST tree traversal (pointer chasing + dynamic dispatch) | 20x-50x per node visit | Highest | +| CRITICAL | GIL preventing parallel parsing/resolution | Linear with core count | Highest | +| HIGH | String operations on qualified names | 5x-15x | High | +| HIGH | Dictionary lookups in hot loops | 3x-10x | High | +| HIGH | Interpreter loop overhead in tight iteration | 5x-20x | High | +| MEDIUM | `bytes.decode("utf-8")` on every node text access | 3x-5x | Moderate | +| MEDIUM | Object headers + reference counting on all intermediates | 2x-5x memory reduction | Moderate | +| MEDIUM | Embedding data format conversions (Python/Tensor/NumPy) | 2x-3x per embedding | Low (model dominates) | +| MEDIUM-HIGH | File I/O with Path objects (revised upward: CPU profiling shows 13.7% of CPU) | 3x-5x | Significant (pathlib overhead, not I/O) | + +**Key insight:** The CRITICAL and HIGH severity patterns are all concentrated in the same code: the parser/ingestion pipeline (Hotspots 1, 2, 7). 
A single Rust extension covering AST traversal, trie operations, and call resolution would address 5 of the 9 overhead categories simultaneously.
+
+**Diffuse overhead note:** Object header overhead (16 bytes per object minimum) and reference counting affect all Python code. Every intermediate `tuple`, `list[str]` from `.split()`, and NamedTuple is heap-allocated with refcounting. A `tuple[str, str]` is ~100 bytes in Python vs ~16 bytes in Rust (stack-allocated). This is not directly addressable per hotspot but is eliminated automatically when hot paths move to Rust.
+
+## Memory Profiling Data (from tracemalloc)
+
+Memory profiling confirms that Python's object model creates significant memory overhead in the same hotspot areas identified by CPU profiling and structural analysis:
+
+| Structure | Python (measured) | Estimated Rust | Memory Ratio |
+|---|---|---|---|
+| Tree-sitter AST node wrappers | 87.3 MiB (343 files, 1.67M wrapper objects) | ~5-10 MiB (direct C struct access) | 9-17x |
+| EmbeddingCache `list[float]` | 48.6 MiB (2k embeddings) | ~6 MiB (packed f32 arrays) | 8x |
+| import_mapping | 5.6 MiB (2k modules) | ~1.5 MiB | 3.7x |
+| rel_groups | 3.6 MiB | ~800 KiB | 4.5x |
+| FunctionRegistryTrie | 3.5 MiB (10k entries, 13.2k intermediate dicts) | ~400 KiB (arena-allocated trie) | 9x |
+
+**Key memory findings:**
+1. **AST node wrappers (87.3 MiB)** are the largest memory consumer. Each `node.children` access creates new Python Node wrapper objects around C pointers. A Rust extension performing extraction natively would avoid all wrapper allocation, reinforcing the Hotspot 1 recommendation.
+2. **EmbeddingCache (48.6 MiB)** uses Python `float` objects (28 bytes each). A 768-dim embedding as `list[float]` uses ~21.5 KiB vs ~3 KiB as packed f32. Switching to numpy arrays (Python-level fix) would provide 4x reduction; Rust packed f32 arrays would be optimal.
+3. **FunctionRegistryTrie (3.5 MiB)** has 13.2k intermediate Python dict objects (64+ bytes each) for 10k entries. A Rust compact trie with byte slices or arena allocation would use ~400 KiB.
+4. **String duplication:** Qualified names are stored in multiple structures (trie, `_entries`, `simple_name_lookup`, `import_mapping`). Python's string interning does not cover long qualified names. Rust string interning via a global interner would deduplicate these.
+
+---
+
+## Non-Language Optimizations (Algorithmic / Python-Level)
+
+CPU profiling and concurrency analysis identified multiple high-impact optimizations that do NOT require a language rewrite. **These should be implemented first** as they collectively address over 70% of CPU time.
+
+### ALGORITHMIC 0: Fix `find_ending_with()` Simple Name Index (THE #1 PRIORITY)
+
+**Issue:** `FunctionRegistryTrie.find_ending_with()` at `graph_updater.py:156` accounts for **48.3% of total CPU time** (15.07s of 31.2s). The `_simple_name_lookup` index has an 80.7% miss rate, causing a linear scan fallback with 123.7M `str.endswith()` calls.
+
+**Projected Speedup:** ~2x on total runtime (eliminating 15s from a 31s run)
+
+**Action:** Build a proper reverse index mapping simple (unqualified) names to their list of qualified names. Populate it during trie insertion. This converts the O(N) linear scan into an O(1) dict lookup per call. This is a pure Python data structure fix requiring minimal code changes.
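+
+A minimal sketch of the reverse-index fix, with simplified class and method names mirroring the ones discussed above:
+
+```python
+from collections import defaultdict
+
+
+class FunctionRegistry:
+    def __init__(self) -> None:
+        self._entries: dict[str, str] = {}
+        self._simple_name_lookup: dict[str, set[str]] = defaultdict(set)
+
+    def insert(self, qualified_name: str, node_type: str) -> None:
+        self._entries[qualified_name] = node_type
+        # Index every entry by its final dot-separated segment at insert
+        # time, so no insertion path can bypass the index.
+        simple = qualified_name.rsplit(".", 1)[-1]
+        self._simple_name_lookup[simple].add(qualified_name)
+
+    def find_ending_with(self, suffix: str) -> list[str]:
+        # O(1) index probe on the suffix's final segment replaces the
+        # O(n) endswith() scan; the filter handles dotted suffixes.
+        candidates = self._simple_name_lookup.get(suffix.rsplit(".", 1)[-1], set())
+        return [qn for qn in candidates if qn == suffix or qn.endswith(f".{suffix}")]
+```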
+
+### ALGORITHMIC 0b: Replace pathlib `relative_to()` with String Operations
+
+**Issue:** `should_skip_path()` consumes **4.29s (13.7%)** due to pathlib's `relative_to()` creating intermediate `PurePosixPath` objects 54,519 times. The actual filesystem I/O is only 0.42s.
+
+**Projected Speedup:** ~3x on the file collection phase (reducing 4.29s to ~0.5s)
+
+**Action:** Replace `path.relative_to(base)` with `str(path)[len(str(base))+1:]` or equivalent string slicing. Merge the duplicate `rglob("*")` passes from `_collect_eligible_files()` and `identify_structure()` into a single traversal. Additionally, pre-filter at directory level: walk the tree manually and skip ignored directories (.git, __pycache__, node_modules, site) immediately rather than enumerating all 59K descendants and filtering after. This would reduce traversal from 59K to ~600 paths.
+
+### ALGORITHMIC 0c: Cache Type Inference Results Per File
+
+**Issue:** `build_local_variable_type_map()` consumes **2.59s (8.3%)** across 5,228 calls, re-traversing ASTs that have already been parsed with no caching across calls within the same file.
+
+**Projected Speedup:** ~2x to 5x on the type inference phase
+
+**Action:** Memoize type inference results per function AST node. Since the AST is immutable after parsing, results are safe to cache.
+
+### ALGORITHMIC 0d: Reduce Debug Logging Overhead
+
+**Issue:** Loguru logging consumes **1.84s (5.9%)** across 91,119 calls, including 85,099 debug-level calls processed even when not displayed.
+
+**Projected Speedup:** Eliminates ~1.8s (5.9% of total runtime)
+
+**Action:** Guard debug log calls behind an explicit verbosity flag, use lazy formatting via `logger.opt(lazy=True)`, or set the minimum log level to INFO in production.
+
+### ALGORITHMIC 0e: Use Compact JSON for Graph Export
+
+**Issue:** `_write_graph_json()` in `main.py:744` uses `json.dump(graph_data, f, indent=2)` which is **8x slower** than compact JSON (86ms vs 11ms for 10K nodes) and produces 1.5x larger output.
+
+**Projected Speedup:** 8x on graph JSON export
+
+**Action:** Use compact JSON (no indent) for machine consumption. Add a separate `--pretty` flag for human-readable output.
+
+### ALGORITHMIC 0f: Binary Format for Embedding Cache
+
+**Issue:** 500 embeddings (768-dim float vectors) stored as JSON = 6.3MB, save = 149ms, load = 38ms. Each embedding is serialized as a JSON array of 768 float values with full decimal precision.
+
+**Projected Speedup:** 10x+ on embedding cache I/O (both size and speed)
+
+**Action:** Use numpy `.npy` or `.npz` format for embedding vectors. A 768-dim float32 vector is 3 KiB in binary vs ~15 KiB in JSON text.
+
+### ALGORITHMIC 1: Batch Embedding API Usage
+
+**Issue:** The `embed_code_batch` function exists but is unused in the main pipeline. The embedding phase calls `embed_code` per-item instead.
+
+**Projected Speedup:** Potentially 5x to 12x on the embedding phase (based on batching reducing HTTP round-trip overhead and enabling server-side batching). The Baseten case study showed 12x throughput improvement from proper batching with GIL release. [Source: baseten.co/blog/your-client-code-matters-10x-higher-embedding-throughput-with-python-and-rust/]
+
+**Action:** Fix the Python pipeline to use `embed_code_batch`. This is a Python-level fix with zero language rewrite cost.
+
+### ALGORITHMIC 2: Incremental Call Re-Resolution
+
+**Issue:** The realtime updater (`realtime_updater.py`) performs full call re-resolution on every file change, reprocessing the entire function registry and call graph.
+ +**Projected Speedup:** 10x to 100x for incremental updates (per the concurrency analysis), since only the changed file's calls and its direct dependents need re-resolution. + +**Action:** Implement incremental call resolution that tracks which qualified names changed and only re-resolves calls that reference those names. This is an algorithmic improvement, not a language choice. + +**These two Python-level fixes should be implemented BEFORE the Rust extension work**, as they may reduce the urgency of the more expensive rewrites. + +--- + +## Language Comparison Matrix + +| Criterion | Rust (PyO3) | Cython | Go | Mojo | Zig | +|---|---|---|---|---|---| +| **Raw performance** | Excellent (C-level) | Good (C-level for numeric) | Good (2x slower than Rust) | Excellent (claims C-level) | Excellent (C-level) | +| **Python FFI quality** | Excellent (PyO3 is mature, zero-copy numpy, vectorcall) | Native (compiles to C extension) | Poor (cgo+ctypes, limited) | Poor (early stage, no stable FFI) | Poor (C ABI only, no Python tooling) | +| **Ecosystem for this workload** | Excellent (tree-sitter crate, regex, serde_json, radix_trie) | Limited (no tree-sitter, string ops need C) | Moderate (tree-sitter-go exists) | None (no tree-sitter, no graph libs) | Limited (tree-sitter C API via @cImport) | +| **Memory safety** | Excellent (borrow checker) | Poor (manual, C-level) | Good (GC, but adds pauses) | Unknown (early stage) | Moderate (manual, but safer than C) | +| **Build complexity** | Moderate (maturin makes it easy) | Low (cythonize) | High (separate binary, IPC needed) | High (Modular toolchain only) | High (no Python tooling) | +| **Developer availability** | Growing (22% increase in Python+Rust developers in 2025) | Declining | Low for Python extensions | Very low | Very low | +| **Real-world precedent** | ruff, uv, polars, pydantic-core, orjson | numpy, scipy (legacy) | None for similar tools | None for similar tools | None for similar tools | + +### Why Rust is the clear winner for this codebase: + +1. **PyO3 maturity:** PyO3 is the most mature Python FFI framework, with zero-copy mechanisms, vectorcall support, and 92% of pure Rust performance. [Source: pyo3.rs/main/performance] + +2. **Tree-sitter native support:** Tree-sitter's runtime is written in C/Rust. Rust can call the tree-sitter C API directly without any Python intermediary, eliminating the per-node FFI overhead that is the primary bottleneck. + +3. **Industry precedent:** The most successful Python performance tools of 2024-2025 are all Rust-backed: ruff (linter, 10-100x faster), uv (package manager), polars (DataFrame, 5-10x faster), pydantic-core (validation, 17x faster), orjson (JSON, 15x faster). [Source: thenewstack.io/rust-pythons-new-performance-engine/] + +4. **maturin build system:** maturin (also by the PyO3 team) simplifies building and distributing Rust Python extensions as standard wheels. No complex build system integration needed. 
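+
+As a concrete illustration of Algorithmic 0f above, a sketch of a packed-float32 cache format; the two-file layout (keys file plus matrix file) is an illustrative choice, not the project's current format:
+
+```python
+import json
+
+import numpy as np
+
+
+def save_cache(path: str, cache: dict[str, list[float]]) -> None:
+    keys = list(cache)
+    # Packed float32: 3 KiB per 768-dim vector vs ~15 KiB as JSON text.
+    matrix = np.asarray([cache[k] for k in keys], dtype=np.float32)
+    np.save(path + ".npy", matrix)
+    with open(path + ".keys.json", "w") as f:
+        json.dump(keys, f)
+
+
+def load_cache(path: str) -> dict[str, np.ndarray]:
+    matrix = np.load(path + ".npy")
+    with open(path + ".keys.json") as f:
+        keys = json.load(f)
+    return dict(zip(keys, matrix))
+```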
+
+---
+
+## Prioritized Implementation Order
+
+### Phase 0: Python Algorithmic Fixes (addresses ~72% of CPU time)
+
+| Priority | Fix | Effort | CPU Time Saved | % of Total |
+|---|---|---|---|---|
+| 0a | Fix `find_ending_with` simple name index | Very low | ~15s | 48.3% |
+| 0b | Replace pathlib `relative_to` with string ops + merge duplicate rglob | Low | ~4s | 13.7% |
+| 0c | Cache type inference results per file | Low | ~2s | 8.3% |
+| 0d | Reduce debug logging overhead | Very low | ~1.8s | 5.9% |
+| 0e | Compact JSON for graph export | Very low | ~75ms per export | N/A (export path) |
+| 0f | Binary format for embedding cache | Low | 10x+ on cache I/O | N/A (cache I/O) |
+| 1 | Batch embedding API usage | Very low | TBD (embedding phase) | TBD |
+| 2 | Incremental call re-resolution | Medium | 10x-100x on realtime | N/A (realtime only) |
+
+**Phase 0 collectively addresses ~72% of measured CPU time (22.8s of 31.2s) with pure Python changes.** After Phase 0, the expected baseline would be ~8-10s for the same 352-file workload.
+
+### Phase 1: Drop-in Rust-backed Libraries (zero code changes)
+
+| Priority | Library | Effort | Expected Speedup |
+|---|---|---|---|
+| 1a | JSON serialization (orjson) | Very low (dependency swap) | 5x-15x on JSON ops |
+| ~~1b~~ | ~~Neo4j driver (neo4j-rust-ext)~~ | ~~RETRACTED~~ | ~~Inapplicable: codebase uses Memgraph/pymgclient, not Neo4j~~ |
+| 1b | Embedding hash (BLAKE3) | Very low (one-line change) | 4x-10x on hashing (confirmed negligible: 0.04s) |
+
+**Note from profiling:** File hashing (`_hash_file`) is only 0.04s total (0.1%), and protobuf serialization is 0.17s total. These are negligible. BLAKE3 (Priority 1b) can be deprioritized. orjson remains worthwhile for larger codebases. The neo4j-rust-ext recommendation was retracted because this codebase uses Memgraph via `pymgclient` (C extension), not the Neo4j Python driver.
+
+### Phase 2: Rust Extension (addresses remaining CPU-bound overhead)
+
+| Priority | Component | Effort | Expected Speedup |
+|---|---|---|---|
+| 2a | AST traversal + type inference (Rust) | High (new extension) | 20x-50x on AST processing |
+| 2b | Trie + call resolution (Rust) | Medium (extend 2a) | 10x-50x on lookups (GIL-bound) |
+
+**Phase 2 should be implemented as a single `codebase-rag-core` Rust crate**, since AST traversal, trie operations, and call resolution are tightly coupled. The Rust extension MUST release the GIL via `Python::allow_threads` during parsing and traversal to preserve thread-level parallelism.
+
+**Amdahl's law caveat (from integration-architect):** Tree-sitter C operations (parse + captures) are only 3.1% of CPU time. A 16x speedup on 3.1% yields only 1.03x total improvement. The value of the Rust AST extension is NOT in speeding up tree-sitter itself (already fast C code), but in eliminating the Python wrapper overhead around it: type inference re-traversal (8.3%), call resolution string operations, and interpreter loop overhead in the tight iteration loops. These Python-side AST costs total ~20% of CPU, making the combined Phase 2 extension worthwhile after Phase 0 algorithmic fixes are applied.
+
+### Phase 3: Architecture Improvements
+
+| Priority | Change | Effort | Expected Speedup |
+|---|---|---|---|
+| 3a | File processing parallelism (ThreadPoolExecutor) | Medium | Downgraded: marginal gains |
+
+**Phase 3 is downgraded based on revised analysis.** CPU profiling shows tree-sitter parsing is only 0.6% of CPU, and the file processing bottleneck (`pathlib.relative_to` at 13.7%) is GIL-bound pure Python that ThreadPoolExecutor cannot parallelize. The pathlib fix (Phase 0b, string slicing, 163x faster) is the correct solution, not parallelism.
ProcessPoolExecutor for call resolution is also impractical: memory profiling shows 170 MiB peak memory, making serialization cost too high. The Rust PyO3 native extension (Phase 2) is the only viable path for parallelizing call resolution, as it can release the GIL via `Python::allow_threads`. + +--- + +## Sources + +- [Gauge.sh: Python extensions should be lazy](https://www.gauge.sh/blog/python-extensions-should-be-lazy) - 16x speedup moving AST processing to Rust +- [Neo4j Python Driver 10x Faster With Rust](https://neo4j.com/blog/developer/python-driver-10x-faster-with-rust/) - neo4j-rust-ext benchmarks +- [Baseten: 12x higher embedding throughput with Python and Rust](https://www.baseten.co/blog/your-client-code-matters-10x-higher-embedding-throughput-with-python-and-rust/) - PyO3 GIL release pattern +- [orjson: 500% Faster JSON in Python](https://medium.com/codeelevation/want-500-faster-json-in-python-try-orjson-powered-by-rust-22995c25c312) - JSON serialization benchmarks +- [PyO3 Performance Guide](https://pyo3.rs/main/performance) - FFI overhead characteristics +- [Rust: Python's New Performance Engine](https://thenewstack.io/rust-pythons-new-performance-engine/) - Industry adoption trends +- [Comparing Cython to Rust for Python Extensions](https://willayd.com/comparing-cython-to-rust-evaluating-python-extensions.html) - Graph algorithm benchmarks +- [SHA-256 Alternatives: BLAKE3 vs SHA-3 Speed Comparison](https://devtoolspro.org/articles/sha256-alternatives-faster-hash-functions-2025/) - Hash function benchmarks +- [Neo4j Performance Recommendations](https://neo4j.com/docs/python-manual/current/performance/) - Batch loading best practices +- [JetBrains Rust vs Python 2025](https://blog.jetbrains.com/rust/2025/11/10/rust-vs-python-finding-the-right-balance-between-speed-and-simplicity/) - String processing benchmarks +- [Databooth: Benchmarking Python with Cython, C, C++, and Rust](https://www.databooth.com.au/posts/py-num-bench/) - Extension comparison +- [Cython, Rust, and more: choosing a language for Python extensions](https://pythonspeed.com/articles/rust-cython-python-extensions/) - When to use each approach +- [ast-grep](https://github.com/ast-grep/ast-grep) - Rust tree-sitter code analysis tool +- [Rust trie implementations](https://dev.to/timclicks/two-trie-implementations-in-rust-ones-super-fast) - Trie performance +- [Corrode: Migrating from Python to Rust](https://corrode.dev/learn/migration-guides/python-to-rust/) - Migration guide +- [Datadog: Migrating static analyzer from Java to Rust](https://www.datadoghq.com/blog/engineering/how-we-migrated-our-static-analyzer-from-java-to-rust/) - Code analysis tool migration diff --git a/LICENSE b/LICENSE index fd189113e..4765780e7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) [2025] [Vitali Avagyan] +Copyright (c) 2025 Vitali Avagyan Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/PRIORITIZED_SCORECARD.md b/PRIORITIZED_SCORECARD.md new file mode 100644 index 000000000..871d96534 --- /dev/null +++ b/PRIORITIZED_SCORECARD.md @@ -0,0 +1,284 @@ +# Prioritized Scorecard: Rewrite Candidates + +**Baseline:** 31.2s total, 179M function calls, indexing 352 Python files (cProfile) + +## Scoring Methodology + +Each candidate is scored 1 to 5 on six dimensions. 
The final rank is determined by **Net Score**, which weights measured/projected performance gain and scope of impact highest, while penalizing integration overhead, risk, and maintenance burden. + +**Weights:** Performance Gain (25%) | Memory Improvement (10%) | Integration Feasibility (20%) | Risk & Complexity (20%) | Scope of Impact (15%) | Maintenance Burden (10%) + +**Score key:** 5 = excellent, 4 = good, 3 = moderate, 2 = poor, 1 = unacceptable + +--- + +## Tier 1: ACCEPTED (High confidence, clear positive ROI) + +### Rank 1: Fix `find_ending_with` Linear Scan (Python Bugfix) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 5 | 48.3% of CPU (15.07s). Eliminates 123.7M `str.endswith()` calls. Projected ~1.9x total speedup. | +| Memory Improvement | 3 | Reduces temporary string allocations from linear scans. | +| Integration Feasibility | 5 | Pure Python fix. Zero new dependencies, zero build changes. | +| Risk & Complexity | 5 | Low risk. Fix the 80.7% miss rate in `_simple_name_lookup` index, or build suffix index. | +| Scope of Impact | 5 | Affects every file processed. Dominant bottleneck in the entire pipeline. | +| Maintenance Burden | 5 | No new language, no new build tooling. Standard Python data structure. | +| **Net Score** | **4.80** | | + +**Verdict: PROCEED IMMEDIATELY.** This is a bugfix, not a rewrite. The `_simple_name_lookup` index has an 80.7% miss rate, causing fallback to O(n) linear scan on every call resolution. Fixing the index population or adding a suffix index is a straightforward Python change with the highest ROI of any candidate. + +--- + +### Rank 2: Replace pathlib with String Operations in `should_skip_path` (Python Refactor) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 4 | 13.7% of CPU (4.29s across 59,012 calls). ~20x faster with string ops. | +| Memory Improvement | 4 | Eliminates ~118,000 intermediate Path objects per run. | +| Integration Feasibility | 5 | Internal refactor. No dependencies. | +| Risk & Complexity | 5 | Replace `Path.relative_to()` with `str.removeprefix()`. Straightforward. | +| Scope of Impact | 4 | Affects file traversal (called for every file and directory). | +| Maintenance Burden | 5 | Simpler code than current pathlib usage. | +| **Net Score** | **4.50** | | + +**Verdict: PROCEED.** Convert paths to strings at the boundary and use string comparison. The pathlib object creation overhead is avoidable. + +--- + +### Rank 3: Cache `build_local_variable_type_map` Results (Python Memoization) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 3 | 8.3% of CPU (2.59s across 5,228 calls). Saves ~2s. | +| Memory Improvement | 2 | Adds ~2MB cache. Slight memory increase. | +| Integration Feasibility | 5 | Add `@lru_cache` or dict-based memoization. No dependencies. | +| Risk & Complexity | 5 | Keyed by (file_path, function_start_line, function_end_line). Cache invalidation handled by existing incremental update system. | +| Scope of Impact | 3 | Affects call resolution for files with multiple functions. | +| Maintenance Burden | 5 | Standard memoization pattern. | +| **Net Score** | **3.90** | | + +**Verdict: PROCEED.** Standard memoization with minimal memory cost. + +--- + +### Rank 4: Suppress Debug Logging in Production (Config Change) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 3 | 5.9% of CPU (1.84s from 85,099 debug calls). Saves ~1.7s. | +| Memory Improvement | 2 | Reduces temporary string allocations from format strings. 
| +| Integration Feasibility | 5 | Set log level to INFO at start of `GraphUpdater.run()`. One line. | +| Risk & Complexity | 5 | Trivial. Debug output not needed during normal graph building. | +| Scope of Impact | 3 | Affects all debug logging throughout pipeline. | +| Maintenance Burden | 5 | No maintenance cost. | +| **Net Score** | **3.75** | | + +**Verdict: PROCEED.** Trivial change, meaningful gain. + +--- + +### Rank 5: Deduplicate Filesystem Traversal (Python Refactor) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 3 | 5.0% of CPU (1.57s). Eliminates duplicate `rglob("*")` + `should_skip_path()` pass. | +| Memory Improvement | 3 | Avoids building duplicate file lists. | +| Integration Feasibility | 4 | Moderate refactor: merge `identify_structure()` and `_collect_eligible_files()` into single traversal. | +| Risk & Complexity | 4 | Requires restructuring two-pass architecture. Not trivial but well-scoped. | +| Scope of Impact | 3 | Affects initial file discovery phase only. | +| Maintenance Burden | 4 | Single-pass is arguably simpler than two-pass. | +| **Net Score** | **3.55** | | + +**Verdict: PROCEED.** Combine with Rank 2 (string paths) for maximum benefit on the file traversal phase. + +--- + +### Rank 6: orjson (Drop-in JSON Replacement) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 3 | 5x to 15x on JSON ops. JSON is NOT a dominant hotspot in the profiling data (indexing phase), but significant for graph export and cache I/O. | +| Memory Improvement | 4 | 75% lower peak RSS for JSON operations. | +| Integration Feasibility | 5 | Add dependency, ~10 call sites need minor adjustment (bytes vs str). | +| Risk & Complexity | 5 | Widely adopted (polars, FastAPI). Pre-built wheels for all platforms. | +| Scope of Impact | 2 | JSON ops are a small fraction of total indexing time. Bigger impact on graph export/import. | +| Maintenance Burden | 5 | Drop-in replacement. No ongoing maintenance cost. | +| **Net Score** | **3.50** | | + +**Verdict: PROCEED.** Low effort, low risk, moderate gain on I/O-heavy workflows (export, cache load/save). Not a game-changer for indexing performance. + +--- + +## Tier 2: CONDITIONAL (Worthwhile only after Tier 1 is complete) + +### Rank 7: Rust AST Processing Extension (PyO3/maturin) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 2 | Tree-sitter ops are only 3.1% of CPU BEFORE Python fixes. After Tier 1 fixes (~3.7x speedup), tree-sitter becomes ~11.8% of reduced runtime. A 16x Rust speedup saves 0.94s from 8.5s. Only 1.12x total improvement post-fixes. | +| Memory Improvement | 4 | Eliminates Python object overhead (50-80 bytes per dict entry), reduces malloc calls by ~8x. | +| Integration Feasibility | 2 | ~110KB of Python code to port. 8+ language parsers. Complex multi-language pattern matching. Requires maturin build system, Rust toolchain in CI/Docker, platform-specific wheels. | +| Risk & Complexity | 2 | Large surface area. Tight coupling with existing data structures. Tree-sitter version compatibility. IngestorProtocol callback complexity. | +| Scope of Impact | 3 | Affects all file processing. But only becomes meaningful at 10,000+ file scale. | +| Maintenance Burden | 2 | Introduces Rust into a pure Python project. Requires Rust expertise for ongoing maintenance. Multi-language build complexity. 
| +| **Net Score** | **2.35** | | + +**Verdict: DEFER.** The integration architect's analysis is decisive: tree-sitter operations consume only 3.1% of actual CPU time. The language researcher's headline claim of 10x to 16x was based on incorrect assumptions about where time was spent. After Tier 1 Python fixes, the remaining 8.5s runtime has tree-sitter at 11.8%, making a 16x Rust speedup yield only 1.12x total. The high development cost (~110KB port, multi-language parsers) and maintenance burden (Rust toolchain, platform-specific wheels) make this poor ROI until the codebase scales an order of magnitude. + +**Reconsider when:** Repository size exceeds 5,000+ files, making tree-sitter operations a larger fraction of total runtime. + +--- + +### Rank 8: File Processing Parallelism (ProcessPoolExecutor) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 3 | 1.5x to 3x after Tier 1 fixes. Limited by sequential pass dependencies (Amdahl's law). | +| Memory Improvement | 1 | Increases memory (per-worker grammar loading, duplicate tries). | +| Integration Feasibility | 3 | Requires restructuring three-pass pipeline. Shared mutable state (trie, import maps) needs synchronization. | +| Risk & Complexity | 3 | Tree-sitter objects not serializable across process boundaries. Worker initialization overhead (~50ms per worker). | +| Scope of Impact | 3 | Affects per-file processing throughput. | +| Maintenance Burden | 3 | Adds concurrency complexity. Harder to debug. | +| **Net Score** | **2.70** | | + +**Verdict: DEFER.** Worth pursuing after Tier 1 fixes reduce the baseline. The concurrency analyst confirmed tree-sitter releases the GIL during parsing, so ThreadPoolExecutor (not ProcessPoolExecutor) is the preferred approach, with lower overhead. But this requires the three-pass architecture to be restructured. + +--- + +## Tier 3: REJECTED (Net gain does not justify complexity) + +### Rank 9: Rust FunctionRegistryTrie (PyO3, standalone) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 2 | Standalone: 1.5x to 3x on trie ops. Python call resolution code still creates strings for every lookup key. FFI crossing per-lookup cuts gains in half. | +| Memory Improvement | 4 | Contiguous memory layout eliminates per-node dict overhead. | +| Integration Feasibility | 2 | Only viable bundled with Rank 7 (Rust AST extension). Standalone, FFI overhead negates gains. | +| Risk & Complexity | 3 | Moderate if bundled. High coupling with Rank 7. | +| Scope of Impact | 2 | **Rank 1 (fix `find_ending_with`) eliminates the primary trie bottleneck.** After that fix, trie operations are no longer the dominant cost. | +| Maintenance Burden | 2 | Requires Rust maintenance alongside Python trie. | +| **Net Score** | **2.30** | | + +**Verdict: REJECT standalone. BUNDLE with Rank 7 if/when Rank 7 proceeds.** The critical insight from the integration architect: standalone Rust trie has negative net gains because FFI boundary crossing happens per-lookup (thousands of times per file). Only viable when bundled with the full Rust AST extension. Furthermore, Rank 1 (Python bugfix) eliminates the primary trie bottleneck (the linear scan), making Rust trie less urgent. + +--- + +### Rank 10: neo4j-rust-ext + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 1 | **0x. This codebase uses Memgraph via pymgclient, NOT the Neo4j Python driver.** neo4j-rust-ext patches the `neo4j` driver which is not used. | +| Memory Improvement | 1 | N/A. 
| +| Integration Feasibility | 1 | Inapplicable. No `neo4j` dependency in `pyproject.toml`. | +| Risk & Complexity | 1 | Wrong driver assumption. | +| Scope of Impact | 1 | Zero impact. | +| Maintenance Burden | 1 | N/A. | +| **Net Score** | **1.00** | | + +**Verdict: REJECT.** The language researcher incorrectly assumed the codebase uses the Neo4j Python driver. It uses Memgraph via pymgclient (a C extension). neo4j-rust-ext has zero applicability. + +--- + +### Rank 11: BLAKE3 Hashing + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 1 | Negligible. Hashing is NOT a bottleneck. `_hash_file` processes ~5ms total for 1000 files. `_content_hash` takes microseconds per call. hashlib SHA256 is already C-backed. | +| Memory Improvement | 1 | No meaningful change. | +| Integration Feasibility | 5 | One-line change per call site. Drop-in. | +| Risk & Complexity | 3 | Cache invalidation forces full re-index on first run after change. One-time negative impact dwarfs per-operation savings. | +| Scope of Impact | 1 | Hashing is <0.1% of total runtime. | +| Maintenance Burden | 4 | Minimal. | +| **Net Score** | **1.85** | | + +**Verdict: REJECT.** Optimizing an operation that takes microseconds per call provides no meaningful improvement. The cache invalidation cost (forced full re-index) creates a one-time penalty that exceeds months of per-operation savings. The integration architect's analysis is correct: "Skip unless profiling proves hashing is >5% of total wall clock time." It is far below 5%. + +--- + +### Rank 12: String Processing in Call Resolution (Rust, standalone) + +| Dimension | Score | Rationale | +|---|---|---| +| Performance Gain | 1 | **Negative standalone.** FFI overhead of passing import maps and trie state for each call resolution exceeds the savings from faster string processing. | +| Memory Improvement | 3 | Would reduce temporary string allocations. | +| Integration Feasibility | 1 | Deeply interleaved with trie lookups, import maps, AST node access. Cannot be isolated without massive FFI overhead. | +| Risk & Complexity | 1 | Requires marshalling all context across FFI per call. | +| Scope of Impact | 2 | Affects call resolution, but FFI boundary negates gains. | +| Maintenance Burden | 2 | Additional Rust code for marginal or negative benefit. | +| **Net Score** | **1.40** | | + +**Verdict: REJECT standalone. BUNDLE with Rank 7 only.** The integration architect proved that the boundary crossing cost exceeds per-operation savings when implemented standalone. Only viable as part of a comprehensive Rust AST extension (Rank 7). 
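+
+The per-lookup boundary cost is easy to underestimate: even a plain Python function call dominates when the per-item work is tiny, and an FFI crossing is costlier still. A pure-Python analogy (a hedged sketch, not an FFI measurement):
+
+```python
+import time
+
+def last_segment(name: str) -> str:
+    # Stand-in for a cheap per-item operation hidden behind a call boundary.
+    return name.rsplit(".", 1)[-1]
+
+names = [f"pkg.mod{i}.Class.method{i}" for i in range(500_000)]
+
+t0 = time.perf_counter()
+_ = [last_segment(n) for n in names]       # one boundary crossing per item
+with_boundary = time.perf_counter() - t0
+
+t0 = time.perf_counter()
+_ = [n.rsplit(".", 1)[-1] for n in names]  # same work, boundary removed
+inline = time.perf_counter() - t0
+
+print(f"call-boundary overhead: {with_boundary / inline:.2f}x")
+```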
+ +--- + +## Combined Impact Projection + +### Phase 1: Tier 1 Python Fixes (Ranks 1 through 6) + +| Fix | Time Saved | % of Total | Cumulative | +|-----|-----------|------------|------------| +| Rank 1: Fix find_ending_with | ~13.5s | 43.3% | 43.3% | +| Rank 2: String path ops | ~4.0s | 12.8% | 56.1% | +| Rank 3: Cache type inference | ~2.0s | 6.4% | 62.5% | +| Rank 4: Suppress debug logging | ~1.7s | 5.5% | 68.0% | +| Rank 5: Deduplicate FS traversal | ~1.5s | 4.8% | 72.8% | +| Rank 6: orjson (I/O workflows) | Variable | Marginal on indexing | 72.8%+ | +| **Total** | **~22.7s** | **72.8%** | | + +**Projected runtime after Phase 1:** ~8.5s (3.7x speedup from pure Python fixes) +**Integration overhead:** Zero +**Build system changes:** One dependency added (orjson) +**Maintenance burden:** None beyond standard Python + +### Phase 2: Tier 2 (Only if needed after Phase 1) + +After Phase 1, the remaining 8.5s breaks down as: +- Tree-sitter operations: ~1.0s (11.8%) +- Call resolution: ~2.5s (29.4%) +- Graph construction: ~2.5s (29.4%) +- File I/O + hashing: ~0.5s (5.9%) +- Miscellaneous: ~2.0s (23.5%) + +The Rust AST extension (Rank 7) would save ~0.94s from tree-sitter, reducing to ~7.6s (1.12x). File parallelism (Rank 8) could provide 1.5x to 3x on top. Combined: ~3.0 to 5.0s total. + +**Phase 2 is only justified when repository sizes exceed 5,000+ files**, where tree-sitter and call resolution become a proportionally larger fraction of total runtime. + +--- + +## Key Findings + +1. **72.8% of the total runtime is addressable with pure Python fixes** (zero integration overhead, zero build changes, zero maintenance burden). + +2. **The headline Rust AST rewrite (10x to 16x) targets only 3.1% of actual CPU time.** Profiling data invalidated the language researcher's core assumption about where time is spent. + +3. **neo4j-rust-ext is completely inapplicable** (wrong database driver). This was a factual error in the language recommendations. + +4. **BLAKE3 hashing optimizes a non-bottleneck** (microsecond-level operations that total <0.1% of runtime). + +5. **Standalone Rust trie and string processing have negative net gains** due to per-lookup FFI boundary crossing costs that exceed the per-operation savings. + +6. **The single largest optimization (Rank 1) is a Python bugfix**, not a language rewrite. Fixing the `_simple_name_lookup` index miss rate from 80.7% to near 0% eliminates 48.3% of total CPU time. 
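+
+The Amdahl's law arithmetic behind findings 1 and 2 (and the 1.12x figure above) is worth making explicit; a standalone sketch using the shares quoted in this document:
+
+```python
+def amdahl(fraction: float, speedup: float) -> float:
+    # Overall speedup when `fraction` of the runtime is accelerated by `speedup`.
+    return 1.0 / ((1.0 - fraction) + fraction / speedup)
+
+print(f"{amdahl(0.728, float('inf')):.2f}x")  # eliminate 72.8% outright -> ~3.68x (finding 1)
+print(f"{amdahl(0.031, 16):.2f}x")            # 16x Rust on 3.1% of CPU -> ~1.03x (finding 2)
+print(f"{amdahl(0.118, 16):.2f}x")            # the same 16x after Tier 1 fixes -> ~1.12x
+```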
+ +--- + +## Scorecard Summary + +| Rank | Candidate | Type | Net Score | Time Saved | Verdict | +|------|-----------|------|-----------|------------|---------| +| 1 | Fix `find_ending_with` | Python bugfix | 4.80 | ~13.5s (43.3%) | **PROCEED** | +| 2 | String path ops | Python refactor | 4.50 | ~4.0s (12.8%) | **PROCEED** | +| 3 | Cache type inference | Python memoization | 3.90 | ~2.0s (6.4%) | **PROCEED** | +| 4 | Suppress debug logging | Config change | 3.75 | ~1.7s (5.5%) | **PROCEED** | +| 5 | Deduplicate FS traversal | Python refactor | 3.55 | ~1.5s (4.8%) | **PROCEED** | +| 6 | orjson | Dependency swap | 3.50 | Variable | **PROCEED** | +| 7 | Rust AST extension | Rust crate | 2.35 | ~0.94s post-fixes | **DEFER** | +| 8 | File parallelism | Architecture change | 2.70 | 1.5x to 3x post-fixes | **DEFER** | +| 9 | Rust trie (standalone) | Rust (PyO3) | 2.30 | Marginal standalone | **REJECT** | +| 10 | neo4j-rust-ext | N/A | 1.00 | 0 (wrong driver) | **REJECT** | +| 11 | BLAKE3 hashing | Dependency swap | 1.85 | Negligible | **REJECT** | +| 12 | Rust string processing | Rust (standalone) | 1.40 | Negative standalone | **REJECT** | + +--- + +**Note:** Task #9 (proof-of-concept benchmarks) was still in progress when this scorecard was produced. If benchmark data reveals performance characteristics that contradict the profiling data used here, this scorecard should be revised. However, the profiling data (cProfile, 31.2s, 179M calls) is empirical and provides a strong basis for these rankings. diff --git a/PYPI_README.md b/PYPI_README.md new file mode 100644 index 000000000..a1dd20c0b --- /dev/null +++ b/PYPI_README.md @@ -0,0 +1,160 @@ +# Code-Graph-RAG + +A graph-based RAG system that parses multi-language codebases with Tree-sitter, builds knowledge graphs in Memgraph, and enables natural language querying, editing, and optimization. + +## Install + +```bash +pip install code-graph-rag +``` + +With all Tree-sitter grammars (Python, JS, TS, Rust, Go, Java, Scala, C++, Lua): + +```bash +pip install 'code-graph-rag[treesitter-full]' +``` + +With semantic code search (UniXcoder embeddings): + +```bash +pip install 'code-graph-rag[semantic]' +``` + +### Prerequisites + +- Python 3.12+ +- Docker (for Memgraph) +- `cmake` (for building pymgclient) +- `ripgrep` (`rg`) (for shell command text searching) + +## CLI Quick Start + +The package installs a `cgr` command. + +**Start Memgraph, parse a repo, and query it:** + +```bash +docker compose up -d # start Memgraph +cgr start --repo-path ./my-project \ + --update-graph --clean # parse & launch interactive chat +``` + +**Index to protobuf for offline use:** + +```bash +cgr index -o ./index-output --repo-path ./my-project +``` + +**Export knowledge graph to JSON:** + +```bash +cgr export -o graph.json +``` + +**AI-guided optimization:** + +```bash +cgr optimize python --repo-path ./my-project +``` + +**Run as an MCP server (for Claude Code):** + +```bash +cgr mcp-server +``` + +**Check your setup:** + +```bash +cgr doctor +``` + +## Python SDK + +The `cgr` package provides short imports for programmatic use. 
+ +### Load and query an exported graph + +```python +from cgr import load_graph + +graph = load_graph("graph.json") +print(graph.summary()) + +functions = graph.find_nodes_by_label("Function") +for fn in functions[:5]: + rels = graph.get_relationships_for_node(fn.node_id) + print(f"{fn.properties['name']}: {len(rels)} relationships") +``` + +### Query Memgraph with Cypher + +```python +from cgr import MemgraphIngestor + +with MemgraphIngestor(host="localhost", port=7687) as db: + rows = db.fetch_all("MATCH (f:Function) RETURN f.name LIMIT 10") + for row in rows: + print(row) +``` + +### Generate Cypher from natural language + +```python +import asyncio +from cgr import CypherGenerator + +async def main(): + gen = CypherGenerator() + cypher = await gen.generate("Find all classes that inherit from BaseModel") + print(cypher) + +asyncio.run(main()) +``` + +### Semantic code search + +Requires the `semantic` extra. + +```python +from cgr import embed_code + +embedding = embed_code("def authenticate(user, password): ...") +print(f"Embedding dimension: {len(embedding)}") +``` + +### Configuration + +```python +from cgr import settings + +settings.set_orchestrator("openai", "gpt-4o", api_key="sk-...") +settings.set_cypher("google", "gemini-2.5-flash", api_key="your-key") +``` + +## Environment Variables + +Configure via `.env` or environment variables: + +| Variable | Default | Description | +|----------|---------|-------------| +| `MEMGRAPH_HOST` | `localhost` | Memgraph hostname | +| `MEMGRAPH_PORT` | `7687` | Memgraph port | +| `ORCHESTRATOR_PROVIDER` | | Provider: `google`, `openai`, `ollama` | +| `ORCHESTRATOR_MODEL` | | Model ID (e.g. `gpt-4o`, `gemini-2.5-pro`) | +| `ORCHESTRATOR_API_KEY` | | API key for the provider (not needed for `ollama`) | +| `CYPHER_PROVIDER` | | Provider for Cypher generation | +| `CYPHER_MODEL` | | Model ID for Cypher generation (e.g. `codellama`, `gpt-4o-mini`) | +| `CYPHER_API_KEY` | | API key for Cypher provider (not needed for `ollama`) | +| `TARGET_REPO_PATH` | `.` | Default repository path | + +## Documentation + +Full documentation, architecture details, and contribution guide: +[docs.code-graph-rag.com](https://docs.code-graph-rag.com) + +## License + +MIT + + diff --git a/README.md b/README.md index 5ef87d4e0..77496e49c 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,11 @@ GitHub forks - - License + + Codecov + + + Quality Gate Status MseeP.ai Security Assessment @@ -21,6 +24,15 @@ Enterprise Support + + PyPI Downloads + + + OpenSSF Scorecard + + + gitcgr +

@@ -35,8 +47,9 @@ An accurate Retrieval-Augmented Generation (RAG) system that analyzes multi-lang ## Latest News 🔥 -- **[NEW]** **MCP Server Integration**: Code-Graph-RAG now works as an MCP server with Claude Code! Query and edit your codebase using natural language directly from Claude Code. [Setup Guide](docs/claude-code-setup.md) -- [2025/10/21] **Semantic Code Search**: Added intent-based code search using UniXcoder embeddings. Find functions by describing what they do (e.g., "error handling functions", "authentication code") rather than by exact names. +- **PHP Language Support**: Full PHP language support added — classes, interfaces, traits, enums, namespaces, PHP 8 attributes, and call graph analysis. Contributed by [@rs-ipps](https://github.com/rs-ipps). +- **C Language Support**: Full C language support added — functions, structs, unions, enums, preprocessor includes, and call graph analysis. Contributed by [@dj0nes](https://github.com/dj0nes). +- **Visualise any GitHub repo instantly!** Just change `github.com` to `gitcgr.com` in any repo URL — that's it, only 3 letters! Get an interactive graph of the entire codebase structure. Try it now: [gitcgr.com](https://gitcgr.com) ## 🚀 Features @@ -45,16 +58,17 @@ An accurate Retrieval-Augmented Generation (RAG) system that analyzes multi-lang | Language | Status | Extensions | Functions | Classes/Structs | Modules | Package Detection | Additional Features | |--------|------|----------|---------|---------------|-------|-----------------|-------------------| +| C | Fully Supported | .c | ✓ | ✓ | ✓ | ✓ | Functions, structs, unions, enums, preprocessor includes | | C++ | Fully Supported | .cpp, .h, .hpp, .cc, .cxx, .hxx, .hh, .ixx, .cppm, .ccm | ✓ | ✓ | ✓ | ✓ | Constructors, destructors, operator overloading, templates, lambdas, C++20 modules, namespaces | | Java | Fully Supported | .java | ✓ | ✓ | ✓ | - | Generics, annotations, modern features (records/sealed classes), concurrency, reflection | | JavaScript | Fully Supported | .js, .jsx | ✓ | ✓ | ✓ | - | ES6 modules, CommonJS, prototype methods, object methods, arrow functions | | Lua | Fully Supported | .lua | ✓ | - | ✓ | - | Local/global functions, metatables, closures, coroutines | +| PHP | Fully Supported | .php | ✓ | ✓ | ✓ | - | Classes, interfaces, traits, enums, namespaces, PHP 8 attributes | | Python | Fully Supported | .py | ✓ | ✓ | ✓ | ✓ | Type inference, decorators, nested functions | | Rust | Fully Supported | .rs | ✓ | ✓ | ✓ | ✓ | impl blocks, associated functions | | TypeScript | Fully Supported | .ts, .tsx | ✓ | ✓ | ✓ | - | Interfaces, type aliases, enums, namespaces, ES6/CommonJS modules | | C# | In Development | .cs | ✓ | ✓ | ✓ | - | Classes, interfaces, generics (planned) | | Go | In Development | .go | ✓ | ✓ | ✓ | - | Methods, type declarations | -| PHP | In Development | .php | ✓ | ✓ | ✓ | - | Classes, functions, namespaces | | Scala | In Development | .scala, .sc | ✓ | ✓ | ✓ | - | Case classes, objects | - **🌳 Tree-sitter Parsing**: Uses Tree-sitter for robust, language-agnostic AST parsing @@ -218,9 +232,20 @@ ollama pull llama3.2 4. **Start Memgraph database**: ```bash -docker-compose up -d +docker compose up -d ``` +5. 
**Verify installation**: +```bash +# If installed from PyPI: +cgr --help + +# If running from source: +uv run cgr --help +``` + +> **Note**: When running from source (cloned repo), prefix all `cgr` commands below with `uv run`, e.g., `uv run cgr start ...` + ## 🛠️ Makefile Commands Use the Makefile for common development tasks: @@ -284,12 +309,23 @@ The system automatically detects and processes files for all supported languages ### Step 2: Query the Codebase +**Interactive mode:** + Start the interactive RAG CLI: ```bash cgr start --repo-path /path/to/your/repo ``` +**Non-interactive mode (single query):** + +Run a single query and exit, with output sent to stdout (useful for scripting): + +```bash +python -m codebase_rag.main start --repo-path /path/to/your/repo \ + --ask-agent "What functions call UserService.create_user?" +``` + ### Step 2.5: Real-Time Graph Updates (Optional) For active development, you can keep your knowledge graph automatically synchronized with code changes using the realtime updater. This is particularly useful when you're actively modifying code and want the AI assistant to always work with the latest codebase structure. @@ -454,7 +490,7 @@ cgr optimize javascript --repo-path /path/to/frontend \ ``` **Supported Languages for Optimization:** -All supported languages: `python`, `javascript`, `typescript`, `rust`, `go`, `java`, `scala`, `cpp` +All supported languages: `python`, `javascript`, `typescript`, `rust`, `go`, `java`, `scala`, `c`, `cpp` **How It Works:** 1. **Analysis Phase**: The agent analyzes your codebase structure using the knowledge graph @@ -532,13 +568,16 @@ claude mcp add --transport stdio code-graph-rag \ | `list_projects` | List all indexed projects in the knowledge graph database. Returns a list of project names that have been indexed. | | `delete_project` | Delete a specific project from the knowledge graph database. This removes all nodes associated with the project while preserving other projects. Use list_projects first to see available projects. | | `wipe_database` | WARNING: Completely wipe the entire database, removing ALL indexed projects. This cannot be undone. Use delete_project for removing individual projects. | -| `index_repository` | Parse and ingest the repository into the Memgraph knowledge graph. This builds a comprehensive graph of functions, classes, dependencies, and relationships. Note: This preserves other projects - only the current project is re-indexed. | -| `query_code_graph` | Query the codebase knowledge graph using natural language. Ask questions like 'What functions call UserService.create_user?' or 'Show me all classes that implement the Repository interface'. | +| `index_repository` | WARNING: Clears all data for the current project including its embeddings. Parse and ingest the repository into the Memgraph knowledge graph. Use update_repository for incremental updates. Only use when explicitly requested. | +| `update_repository` | Update the repository in the Memgraph knowledge graph without clearing existing data. Use this for incremental updates. | +| `query_code_graph` | Query the codebase knowledge graph using natural language. Use semantic_search unless you know the exact names of classes/functions you are searching for. Ask questions like 'What functions call UserService.create_user?' or 'Show me all classes that implement the Repository interface'. | | `get_code_snippet` | Retrieve source code for a function, class, or method by its qualified name. 
Returns the source code, file path, line numbers, and docstring. | | `surgical_replace_code` | Surgically replace an exact code block in a file using diff-match-patch. Only modifies the exact target block, leaving the rest unchanged. | | `read_file` | Read the contents of a file from the project. Supports pagination for large files. | | `write_file` | Write content to a file, creating it if it doesn't exist. | | `list_directory` | List contents of a directory in the project. | +| `semantic_search` | Performs a semantic search for functions based on a natural language query describing their purpose, returning a list of potential matches with similarity scores. Requires the 'semantic' extra to be installed. | +| `ask_agent` | Ask the RAG agent a question about the codebase. Wraps the full RAG pipeline (graph query, LLM response) as an MCP tool. | ### Example Usage @@ -561,35 +600,36 @@ The knowledge graph uses the following node types and relationships: | Label | Properties | |-----|----------| | Project | `{name: string}` | -| Package | `{qualified_name: string, name: string, path: string}` | -| Folder | `{path: string, name: string}` | -| File | `{path: string, name: string, extension: string}` | -| Module | `{qualified_name: string, name: string, path: string}` | -| Class | `{qualified_name: string, name: string, decorators: list[string]}` | -| Function | `{qualified_name: string, name: string, decorators: list[string]}` | -| Method | `{qualified_name: string, name: string, decorators: list[string]}` | -| Interface | `{qualified_name: string, name: string}` | -| Enum | `{qualified_name: string, name: string}` | +| Package | `{qualified_name: string, name: string, path: string, absolute_path: string}` | +| Folder | `{path: string, name: string, absolute_path: string}` | +| File | `{path: string, name: string, extension: string, absolute_path: string}` | +| Module | `{qualified_name: string, name: string, path: string, absolute_path: string}` | +| Class | `{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}` | +| Function | `{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}` | +| Method | `{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}` | +| Interface | `{qualified_name: string, name: string, path: string, absolute_path: string}` | +| Enum | `{qualified_name: string, name: string, path: string, absolute_path: string}` | | Type | `{qualified_name: string, name: string}` | | Union | `{qualified_name: string, name: string}` | -| ModuleInterface | `{qualified_name: string, name: string, path: string}` | -| ModuleImplementation | `{qualified_name: string, name: string, path: string, implements_module: string}` | +| ModuleInterface | `{qualified_name: string, name: string, path: string, absolute_path: string}` | +| ModuleImplementation | `{qualified_name: string, name: string, path: string, absolute_path: string, implements_module: string}` | | ExternalPackage | `{name: string, version_spec: string}` | ### Language-Specific Mappings +- **C**: `enum_specifier`, `function_definition`, `struct_specifier`, `union_specifier` - **C++**: `class_specifier`, `declaration`, `enum_specifier`, `field_declaration`, `function_definition`, `lambda_expression`, `struct_specifier`, `template_declaration`, `union_specifier` - **Java**: `annotation_type_declaration`, `class_declaration`, `constructor_declaration`, `enum_declaration`, `interface_declaration`, 
`method_declaration`, `record_declaration` - **JavaScript**: `arrow_function`, `class`, `class_declaration`, `function_declaration`, `function_expression`, `generator_function_declaration`, `method_definition` - **Lua**: `function_declaration`, `function_definition` +- **PHP**: `anonymous_function`, `arrow_function`, `class_declaration`, `enum_declaration`, `function_definition`, `interface_declaration`, `method_declaration`, `trait_declaration` - **Python**: `class_definition`, `function_definition` - **Rust**: `closure_expression`, `enum_item`, `function_item`, `function_signature_item`, `impl_item`, `struct_item`, `trait_item`, `type_item`, `union_item` - **TypeScript**: `abstract_class_declaration`, `arrow_function`, `class`, `class_declaration`, `enum_declaration`, `function_declaration`, `function_expression`, `function_signature`, `generator_function_declaration`, `interface_declaration`, `internal_module`, `method_definition`, `type_alias_declaration` - **C#**: `anonymous_method_expression`, `class_declaration`, `constructor_declaration`, `destructor_declaration`, `enum_declaration`, `function_pointer_type`, `interface_declaration`, `lambda_expression`, `local_function_statement`, `method_declaration`, `struct_declaration` - **Go**: `function_declaration`, `method_declaration`, `type_declaration` -- **PHP**: `anonymous_function`, `arrow_function`, `class_declaration`, `enum_declaration`, `function_definition`, `function_static_declaration`, `interface_declaration`, `trait_declaration` - **Scala**: `class_definition`, `function_declaration`, `function_definition`, `object_definition`, `trait_definition` @@ -679,6 +719,7 @@ my_build_output - **pydantic-settings**: Settings management using Pydantic - **pymgclient**: Memgraph database adapter for Python language - **python-dotenv**: Read key-value pairs from a .env file and set them as environment variables +- **tiktoken**: Fast BPE tokeniser used for token counting and context window management - **toml**: Python Library for Tom's Obvious, Minimal Language - **tree-sitter-python**: Python grammar for tree-sitter - **tree-sitter**: Python bindings to the Tree-sitter parsing library @@ -887,3 +928,7 @@ We also offer custom development, integration consulting, technical support cont ## Star History [![Star History Chart](https://api.star-history.com/svg?repos=vitali87/code-graph-rag&type=Date)](https://www.star-history.com/#vitali87/code-graph-rag&Date) + +## Fork History + +[![Fork History Chart](https://fork-history.site/svg?repos=vitali87/code-graph-rag)](https://fork-history.site/#vitali87/code-graph-rag) diff --git a/REWRITE_RECOMMENDATIONS.md b/REWRITE_RECOMMENDATIONS.md new file mode 100644 index 000000000..ebd649eda --- /dev/null +++ b/REWRITE_RECOMMENDATIONS.md @@ -0,0 +1,340 @@ +# Rewrite Recommendations: code-graph-rag Performance Optimization + +## Executive Summary + +A comprehensive performance analysis of the code-graph-rag codebase (31.2s total, 179M function calls indexing 352 Python files) reveals that **no language rewrite is currently justified**. The top performance bottlenecks are algorithmic inefficiencies and unnecessary object creation in pure Python code, addressable with zero new dependencies and zero build system changes. + +### Top 3 Recommendations + +1. **Fix `find_ending_with` suffix index** (Python bugfix): Eliminates 48.3% of total CPU time. The `_simple_name_lookup` index has an 80.7% miss rate, causing 123.7M `str.endswith()` calls via linear scan fallback. 
Benchmarked fix: **261x to 382x speedup** on the operation. Projected total speedup: ~1.9x.
+
+2. **Replace pathlib with string operations in `should_skip_path`** (Python refactor): Eliminates 13.7% of total CPU time. `pathlib.relative_to()` creates intermediate objects on every call (3.39s across 54,519 calls). Benchmarked fix: **45x to 634x speedup** on path operations. Projected total speedup: ~1.15x.
+
+3. **Cache `build_local_variable_type_map` results** (Python memoization): Eliminates 8.3% of total CPU time. 5,228 uncached AST traversals. Projected total speedup: ~1.07x.
+
+**Combined Tier 1 impact:** ~3.7x total speedup (31.2s to ~8.5s) from pure Python fixes with zero integration overhead.
+
+### Key Finding: Rust Rewrite Not Justified
+
+The language researcher's headline recommendation (Rust AST extension for "10x to 16x speedup") targets tree-sitter operations that consume only **3.1% of actual CPU time**; by Amdahl's law, a 16x Rust speedup on that slice yields only **1.03x total improvement**. Even after Tier 1 Python fixes raise tree-sitter's share to ~11.8% of the reduced runtime, the gain is only ~1.12x. The high development cost (~110KB of Python to port, multi-language parser support, Rust toolchain in CI/Docker) and maintenance burden make this a poor ROI until repository sizes exceed 5,000+ files.
+
+### Adversarial Review Outcome
+
+The adversarial reviewer confirmed that **no language rewrite candidate survives challenge**. All top hotspots are fixable in Python. The Rust AST extension was the only candidate with theoretical merit, but the measured 3.1% CPU share makes it unjustifiable at current scale.
+
+### Security Audit Outcome
+
+The security auditor approved all recommended candidates with zero disputes. The only new dependency (orjson) is a widely adopted, well-maintained package with pre-built wheels.
+
+---
+
+## Profiling Baseline
+
+| Metric | Value |
+|--------|-------|
+| Profiling tool | cProfile |
+| Total runtime | 31.2 seconds |
+| Total function calls | 179M |
+| Workload | `GraphUpdater.run(force=True)` indexing 352 Python files |
+| Platform | macOS Darwin 25.3.0, ARM64 |
+| Python version | 3.12.2 (CPython) |
+| Key dependencies | tree-sitter 0.25.2, pymgclient, loguru, torch 2.10 |
+
+---
+
+## Detailed Analysis: Accepted Candidates
+
+### Candidate 1: Fix `find_ending_with` Linear Scan
+
+**Priority:** 1 (Highest)
+**Type:** Python bugfix
+**Effort:** Low
+**Files:** `codebase_rag/graph_updater.py:156-161`
+
+**Profiling Data:**
+- Self time: 7.91s (25.3%)
+- Cumulative time: 15.07s (48.3%)
+- Call count: 27,376 calls
+- Root cause: `_simple_name_lookup` index miss rate of 80.7% (22,096 of 27,376 calls)
+- Fallback: `[qn for qn in self._entries.keys() if qn.endswith(f".{suffix}")]` generating 123.7M `str.endswith()` invocations
+
+**Benchmark Results:**
+
+| Registry Size | Queries | Linear Scan (ms) | Suffix Index (ms) | Speedup |
+|---|---|---|---|---|
+| 1,000 | 38 | 1.77 | 0.007 | 261x |
+| 4,500 | 38 | 8.04 | 0.023 | 356x |
+| 10,000 | 38 | 17.78 | 0.046 | 382x |
+
+**Fix:** Populate `_simple_name_lookup` for every insert path, including `__setitem__`. Build a complete suffix index mapping the last dot-separated segment to the full qualified name set. This converts O(n) scans to O(1) lookups.
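+
+A minimal sketch of the suffix-index shape (illustrative only; the real `FunctionRegistryTrie` stores richer entries and must also handle deletion and invalidation):
+
+```python
+from collections import defaultdict
+
+class RegistryIndex:
+    """Illustrative sketch of the suffix-index fix, not the production class."""
+
+    def __init__(self) -> None:
+        self._entries: dict[str, object] = {}
+        self._simple_name_lookup: defaultdict[str, set[str]] = defaultdict(set)
+
+    def __setitem__(self, qualified_name: str, value: object) -> None:
+        self._entries[qualified_name] = value
+        # Index on every insert path so lookups never fall back to a linear scan.
+        simple_name = qualified_name.rsplit(".", 1)[-1]
+        self._simple_name_lookup[simple_name].add(qualified_name)
+
+    def find_ending_with(self, suffix: str) -> list[str]:
+        # O(1) candidate lookup by the suffix's last segment, then an exact
+        # suffix check, replacing the O(n) str.endswith() scan over all entries.
+        candidates = self._simple_name_lookup.get(suffix.rsplit(".", 1)[-1], set())
+        return [qn for qn in candidates if qn == suffix or qn.endswith(f".{suffix}")]
+```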
+ +**Projected Net Gain:** ~1.9x total speedup (13.5s saved) +**Integration Overhead:** Zero +**Risk:** Very low + +--- + +### Candidate 2: Replace pathlib with String Operations + +**Priority:** 2 +**Type:** Python refactor +**Effort:** Low +**Files:** `codebase_rag/utils/path_utils.py`, `codebase_rag/graph_updater.py:364-388` + +**Profiling Data:** +- Cumulative time: 4.29s (13.7%) +- Call count: 59,270 calls +- Root cause: `pathlib.relative_to()` creates intermediate `PurePosixPath` objects (3.39s across 54,519 calls) + +**Benchmark Results:** + +| Operation | pathlib (ms) | String ops (ms) | Speedup | +|---|---|---|---| +| `relative_to` vs `removeprefix` (5K paths) | 61.3 | 0.097 | 634x | +| Full `should_skip_path` (5K paths) | 69.3 | 1.55 | 45x | +| Full `should_skip_path` (20K paths) | 285.9 | 6.21 | 46x | + +**Fix:** Convert paths to strings at the function boundary. Use `str.removeprefix()` and `str.split("/")` instead of `Path.relative_to()` and `Path.parts`. + +**Projected Net Gain:** ~1.15x total speedup (4.0s saved) +**Integration Overhead:** Zero +**Risk:** Very low + +--- + +### Candidate 3: Cache Type Inference Results + +**Priority:** 3 +**Type:** Python memoization +**Effort:** Low +**Files:** `codebase_rag/parsers/type_inference.py:119` + +**Profiling Data:** +- Cumulative time: 2.59s (8.3%) +- Call count: 5,228 calls +- Root cause: Re-traverses AST nodes per function for type inference without caching + +**Fix:** Memoize results keyed by `(file_path, function_start_line, function_end_line)`. Cache invalidation handled by existing incremental update system. + +**Projected Net Gain:** ~1.07x total speedup (2.0s saved) +**Integration Overhead:** ~2MB memory for cache +**Risk:** Low + +--- + +### Candidate 4: Suppress Debug Logging in Production + +**Priority:** 4 +**Type:** Configuration change +**Effort:** Trivial +**Files:** `codebase_rag/graph_updater.py` (run method) + +**Profiling Data:** +- Cumulative time: 1.84s (5.9%) +- Call count: 91,119 calls (85,099 debug-level) +- Root cause: Debug log calls processed even when output is suppressed + +**Fix:** Set loguru level to INFO at the start of `GraphUpdater.run()`, or use `logger.opt(lazy=True).debug()` for expensive format strings. + +**Projected Net Gain:** ~1.06x total speedup (1.7s saved) +**Integration Overhead:** Zero +**Risk:** Very low + +--- + +### Candidate 5: Deduplicate Filesystem Traversal + +**Priority:** 5 +**Type:** Python refactor +**Effort:** Low +**Files:** `codebase_rag/graph_updater.py:364`, `codebase_rag/parsers/structure_processor.py:49` + +**Profiling Data:** +- `identify_structure()`: 1.57s (5.0%) +- `_collect_eligible_files()`: 4.71s (15.1%, overlapping with Candidate 2) +- Root cause: Both call `rglob("*")` + `should_skip_path()` independently + +**Fix:** Merge into a single traversal pass that collects both structural elements and eligible files. 
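+
+A sketch of the merged pass, with illustrative predicate parameters standing in for the real `should_skip_path` and eligibility checks:
+
+```python
+from collections.abc import Callable
+from pathlib import Path
+
+def traverse_once(
+    root: Path,
+    should_skip: Callable[[Path], bool],
+    is_eligible: Callable[[Path], bool],
+) -> tuple[list[Path], list[Path]]:
+    # One rglob() walk yields both the structural elements and the parse
+    # worklist, instead of two independent rglob("*") + skip-check passes.
+    folders: list[Path] = []
+    eligible_files: list[Path] = []
+    for path in root.rglob("*"):
+        if should_skip(path):
+            continue
+        if path.is_dir():
+            folders.append(path)
+        elif is_eligible(path):
+            eligible_files.append(path)
+    return folders, eligible_files
+```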
+ +**Projected Net Gain:** ~1.05x total speedup (1.5s saved) +**Integration Overhead:** Moderate refactor of two-pass architecture +**Risk:** Low + +--- + +### Candidate 6: orjson for JSON Serialization + +**Priority:** 6 +**Type:** Dependency swap +**Effort:** Trivial +**Files:** All files using `import json` (graph_loader.py, graph_updater.py, embedder.py, services/graph_service.py) + +**Benchmark Results:** + +| Operation | json (ms) | orjson (ms) | Speedup | +|---|---|---|---| +| Compact dumps (1.9 MB) | 5.73 | 1.01 | 5.7x | +| Indented dumps (1.9 MB) | 48.5 | 2.02 | 24.0x | +| Loads (1.9 MB) | 6.23 | 3.24 | 1.9x | + +**Fix:** Add `orjson>=3.10.0` to dependencies. Replace `json.dumps()` with `orjson.dumps()` (~10 call sites, minor API adjustment for bytes vs str return type). + +**Projected Net Gain:** 5.4x to 25x on JSON operations. Marginal impact on indexing (JSON is not a dominant hotspot), significant impact on graph export/import. +**Integration Overhead:** Near zero +**Security:** Widely adopted (polars, FastAPI). Pre-built wheels. Approved by security audit. +**Risk:** Very low + +--- + +## Combined Impact Projection + +| Phase | Fixes | Time Saved | Cumulative Speedup | Overhead | +|-------|-------|-----------|-------------------|----------| +| Tier 1 | Candidates 1 through 6 | ~22.7s | ~3.7x (31.2s to ~8.5s) | Zero (except orjson dep) | + +**Post Tier 1 runtime breakdown (projected ~8.5s):** + +| Component | Time | % of Reduced Total | +|-----------|------|--------------------| +| Call resolution | ~2.5s | 29.4% | +| Graph construction | ~2.5s | 29.4% | +| Miscellaneous | ~2.0s | 23.5% | +| Tree-sitter operations | ~1.0s | 11.8% | +| File I/O + hashing | ~0.5s | 5.9% | + +--- + +## Deferred Candidates + +### Rust AST Processing Extension (PyO3/maturin) + +**Status:** DEFERRED (reconsider at 5,000+ file scale) + +**Rationale:** Tree-sitter operations consume 3.1% of CPU (0.97s). After Tier 1 fixes, this becomes 11.8% of the reduced 8.5s runtime. A 16x Rust speedup saves 0.94s, yielding 1.12x total improvement. + +**Why deferred, not rejected:** +- At 5,000+ file scale, tree-sitter time scales linearly while Python fix savings are largely constant +- The structural overhead per node visit (20x to 50x) is real but only matters when visit count is high enough +- Rust extension would also unlock GIL-free thread parallelism for file processing + +**Cost if pursued:** ~110KB of Python code to port, 8+ language parsers, maturin build system, Rust toolchain in CI/Docker, platform-specific wheels, ongoing Rust maintenance + +### File Processing Parallelism + +**Status:** DEFERRED (pursue after Tier 1 fixes) + +**Rationale:** Tree-sitter releases the GIL during parsing, enabling ThreadPoolExecutor parallelism. However, shared mutable state (`FunctionRegistryTrie`, `import_mapping`) requires architectural restructuring. The three-pass architecture (structure, definitions, calls) has inherent sequential dependencies. + +**Projected gain:** 1.5x to 3x after Tier 1 fixes +**Prerequisite:** Tier 1 fixes must be applied first to establish the new performance baseline + +--- + +## Rejected Candidates + +### neo4j-rust-ext + +**Verdict:** REJECTED (inapplicable) +**Reason:** This codebase uses Memgraph via `pymgclient` (C extension), not the Neo4j Python driver. `neo4j-rust-ext` patches the `neo4j` driver which is not a dependency. The language researcher's recommendation was based on an incorrect assumption about the database driver. 
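+
+A quick way to verify the driver finding locally (a sketch; assumes a checked-out repo with a standard PEP 621 `pyproject.toml`):
+
+```python
+import tomllib  # stdlib in Python 3.11+; the project targets 3.12+
+
+with open("pyproject.toml", "rb") as f:
+    deps = tomllib.load(f)["project"]["dependencies"]
+
+# No `neo4j` driver for neo4j-rust-ext to accelerate; the Memgraph client is what ships.
+print(any(d.startswith("neo4j") for d in deps))      # expected: False
+print(any(d.startswith("pymgclient") for d in deps)) # expected: True
+```
+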
+ +### BLAKE3 Hashing + +**Verdict:** REJECTED (invalidated by benchmarks) + +**Benchmark Results:** + +| Operation | SHA256 (ms) | BLAKE3 (ms) | Speedup | +|---|---|---|---| +| 500 snippet hashes | 0.155 | 0.325 | 0.5x (slower) | +| 2,000 snippet hashes | 0.594 | 1.177 | 0.5x (slower) | +| 50 file hashes (5KB avg) | 0.968 | 1.031 | 0.9x (slower) | + +**Reason:** The language recommendations projected 4x to 10x speedup based on algorithmic benchmarks, not Python binding benchmarks. hashlib SHA256 is already C-backed (OpenSSL). BLAKE3's SIMD advantages require large contiguous buffers; code snippets average 200 bytes. FFI overhead per call exceeds algorithmic savings for small inputs. Additionally, hashing is <0.1% of total runtime. + +### Rust FunctionRegistryTrie (Standalone) + +**Verdict:** REJECTED +**Reason:** Standalone Rust trie provides only 1.5x to 3x net gain after FFI overhead. The FFI boundary is crossed per-lookup (thousands of times per file), cutting gains roughly in half. More critically, the Python suffix index fix (Candidate 1) provides 261x to 382x speedup on the actual bottleneck, making the Rust trie unnecessary. Only viable if bundled with a full Rust AST extension. + +### Rust String Processing in Call Resolution (Standalone) + +**Verdict:** REJECTED +**Reason:** Negative net gains when implemented standalone. Call resolution is deeply interleaved with trie lookups, import map lookups, and AST node access. Extracting just the string processing would require marshalling all context (import maps, trie state, class inheritance) across FFI on every call, which exceeds the per-operation savings. + +--- + +## Optimize-First Recommendations (Non-Rewrite) + +These Python-level improvements should be implemented before any language rewrite consideration: + +1. **Use `embed_code_batch`** in `graph_updater.py:_generate_semantic_embeddings`: The batch function exists but the pipeline calls `embed_code` per item. Projected 5x to 20x speedup on the embedding phase. + +2. **Incremental call re-resolution** in `realtime_updater.py`: Currently performs full call re-resolution on every file change. Implementing incremental resolution (re-resolve only affected qualified names) would provide 10x to 100x speedup for realtime updates. + +3. **Fix BoundedASTCache memory limit**: `sys.getsizeof()` misses C-level tree-sitter memory, so the cache size limit is effectively broken. Use `tracemalloc` or a conservative estimate based on entry count instead. + +4. **EmbeddingCache data format**: Replace `list[float]` with numpy arrays for 4x memory reduction on embedding storage. + +5. **FunctionRegistryTrie dual storage**: Consolidate `_entries` dict and trie nodes to eliminate 2.5 MiB waste per 10K entries (addressable as part of Candidate 1). + +--- + +## Benchmark Methodology + +**Infrastructure:** Established by test-sentinel (task #1). All benchmarks in `benchmarks/` directory. 
| Parameter | Value |
+|-----------|-------|
+| Warmup runs | 3 (discarded) |
+| Measured iterations | 20 to 100 per benchmark |
+| Statistics | Median, mean, stddev, min, max, p95 |
+| GC | Disabled during timing |
+| Isolation | Fresh function scope per run |
+
+**Benchmark suite:**
+
+| File | Target |
+|------|--------|
+| `bench_find_ending_with_fix.py` | Suffix index vs linear scan |
+| `bench_pathlib_vs_string.py` | pathlib vs string path operations |
+| `bench_json_serialization.py` | stdlib json vs orjson |
+| `bench_file_hashing.py` | SHA256 vs BLAKE3 vs BLAKE2b |
+| `bench_trie.py` | FunctionRegistryTrie operations |
+| `bench_string_ops.py` | String operation microbenchmarks |
+| `bench_embedding_cache.py` | EmbeddingCache operations |
+| `bench_ast_cache.py` | BoundedASTCache operations |
+| `bench_graph_loader.py` | GraphLoader JSON parse + index build |
+| `bench_dropin_replacements.py` | Drop-in library comparisons |
+
+Run all benchmarks: `uv run python benchmarks/run_all.py`
+
+---
+
+## Profiling Data Sources
+
+| Phase | Task | Owner | Output |
+|-------|------|-------|--------|
+| Baseline | #1 | test-sentinel | Green test suite, benchmark methodology |
+| CPU profiling | #2 | cpu-profiler | Hotspot report (cProfile, 31.2s, 179M calls) |
+| Memory profiling | #3 | memory-profiler | Allocation report (tracemalloc, 25-frame traces) |
+| I/O profiling | #4 | cpu-profiler | I/O report |
+| Concurrency analysis | #5 | concurrency-analyst | GIL analysis, parallelism opportunities, scaling factors |
+| Structural analysis | #6 | static-pattern-analyst | 9 language-inherent ceilings with severity rankings |
+| Language research | #7 | language-researcher | Target language recommendations (Rust via PyO3) |
+| Integration feasibility | #8 | integration-architect | FFI overhead analysis, build system impact, net gain calculations |
+| Benchmarks | #9 | benchmark-designer | Measured performance for all candidates |
+| Scorecard | #10 | evaluator | Prioritized ranking with scores |
+| Adversarial review | #11 | adversarial-reviewer | No rewrite justified at current scale |
+| Security audit | #12 | security-auditor | All candidates approved, zero disputes |
+
+---
+
+## Conclusion
+
+The performance analysis produced a clear, data-driven result: **optimize Python first, rewrite later (if ever).**
+
+The top 5 bottlenecks consuming 72.8% of runtime are all pure Python algorithmic issues (linear scan fallback, pathlib object overhead, uncached traversals, debug logging, duplicate traversals). Fixing them provides ~3.7x total speedup with zero integration overhead, zero build system changes, and zero maintenance burden.
+
+The Rust AST extension, while technically sound as a future optimization for large-scale workloads, targets only 3.1% of current CPU time, yielding ~1.03x total improvement (and only ~1.12x even when measured against the smaller post-fix baseline). It should be reconsidered only when the codebase routinely processes 5,000+ file repositories and the Python fixes have been applied.
+
+No language rewrite recommendation survived the adversarial review at current scale. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 000000000..8bf17426b --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,46 @@ +# Security Policy
+
+## Supported Versions
+
+| Version | Supported |
+| ------- | ------------------ |
+| 0.0.x | :white_check_mark: |
+
+As the project is in early development (pre 1.0), only the latest release receives security updates.
Please ensure you are running the most recent version before reporting a vulnerability. + +## Reporting a Vulnerability + +**Please do not report security vulnerabilities through public GitHub issues, pull requests, or discussions.** + +Instead, please use [GitHub Private Vulnerability Reporting](https://github.com/vitali87/code-graph-rag/security/advisories/new) to submit your report. This ensures the details remain confidential until a fix is available. + +When reporting, please include: + +- A description of the vulnerability and its potential impact +- Steps to reproduce or a proof of concept +- The version(s) affected +- Any suggested fix, if available + +## What to Expect + +- **Acknowledgement** within 72 hours of your report +- **Status update** within 7 days with an initial assessment +- **Resolution target** of 30 days for confirmed vulnerabilities, though critical issues will be prioritized for faster turnaround + +If the vulnerability is accepted, we will work on a fix, coordinate disclosure with you, and credit you in the release notes (unless you prefer to remain anonymous). + +If the vulnerability is declined, we will provide a clear explanation of why. + +## Scope + +This policy applies to the `code-graph-rag` Python package and its official repository. Third party dependencies are outside the direct scope of this policy, though we use Dependabot to monitor and update them. + +## Security Measures in This Project + +- **Dependency scanning**: Dependabot is enabled for automated dependency updates +- **Secret scanning**: GitHub secret scanning is active on this repository +- **Branch protection**: The `main` branch requires pull request reviews before merging + +## Preferred Languages + +We accept security reports in English. diff --git a/benchmarks/bench_ast_cache.py b/benchmarks/bench_ast_cache.py new file mode 100644 index 000000000..b1e3e65d9 --- /dev/null +++ b/benchmarks/bench_ast_cache.py @@ -0,0 +1,134 @@ +import statistics +import sys +import time +from collections import OrderedDict +from pathlib import Path + +WARMUP_RUNS = 3 +BENCH_RUNS = 50 + + +class MockNode: + __slots__ = ("data",) + + def __init__(self, size: int) -> None: + self.data = b"\x00" * size + + +def bench_ordered_dict_insert(count: int, item_size: int) -> float: + start = time.perf_counter() + cache: OrderedDict[Path, tuple[MockNode, str]] = OrderedDict() + for i in range(count): + key = Path(f"/fake/path/module_{i}.py") + cache[key] = (MockNode(item_size), "python") + return time.perf_counter() - start + + +def bench_ordered_dict_lookup(cache: OrderedDict, keys: list[Path]) -> float: + start = time.perf_counter() + for key in keys: + _ = key in cache + return time.perf_counter() - start + + +def bench_ordered_dict_access_lru(cache: OrderedDict, keys: list[Path]) -> float: + start = time.perf_counter() + for key in keys: + if key in cache: + cache.move_to_end(key) + _ = cache[key] + return time.perf_counter() - start + + +def bench_ordered_dict_eviction(count: int, max_size: int, item_size: int) -> float: + start = time.perf_counter() + cache: OrderedDict[Path, tuple[MockNode, str]] = OrderedDict() + for i in range(count): + key = Path(f"/fake/path/module_{i}.py") + cache[key] = (MockNode(item_size), "python") + while len(cache) > max_size: + cache.popitem(last=False) + return time.perf_counter() - start + + +def bench_getsizeof_overhead(cache: OrderedDict) -> float: + start = time.perf_counter() + _ = sum(sys.getsizeof(v) for v in cache.values()) + return time.perf_counter() - start + + +def 
run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<45} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 115) + for r in results: + print( + f"{r['name']:<45} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + configs = [ + (500, 1024), + (2000, 4096), + (5000, 8192), + ] + + for count, item_size in configs: + print(f"\n{'='*115}") + print(f"BoundedASTCache Benchmark (entries={count}, item_size={item_size}B)") + print(f"{'='*115}") + + results = [] + + r = run_benchmark(f"insert ({count})", bench_ordered_dict_insert, count, item_size) + results.append(r) + + cache: OrderedDict[Path, tuple[MockNode, str]] = OrderedDict() + keys: list[Path] = [] + for i in range(count): + key = Path(f"/fake/path/module_{i}.py") + keys.append(key) + cache[key] = (MockNode(item_size), "python") + + r = run_benchmark(f"lookup ({count})", bench_ordered_dict_lookup, cache, keys) + results.append(r) + + r = run_benchmark(f"access+LRU ({count})", bench_ordered_dict_access_lru, cache, keys) + results.append(r) + + max_size = count // 2 + r = run_benchmark( + f"insert+evict (max={max_size})", + bench_ordered_dict_eviction, count, max_size, item_size, + ) + results.append(r) + + r = run_benchmark(f"getsizeof scan ({count})", bench_getsizeof_overhead, cache) + results.append(r) + + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_dropin_replacements.py b/benchmarks/bench_dropin_replacements.py new file mode 100644 index 000000000..ee4eb0b0a --- /dev/null +++ b/benchmarks/bench_dropin_replacements.py @@ -0,0 +1,267 @@ +import hashlib +import json +import os +import statistics +import tempfile +import time +from pathlib import Path + +try: + import blake3 + import orjson +except ImportError as e: + print(f"SKIP bench_dropin_replacements: {e}") + print("Install with: uv pip install blake3 orjson") + raise SystemExit(0) + +WARMUP_RUNS = 3 +BENCH_RUNS = 30 + + +def generate_graph_data(num_nodes: int, num_rels: int) -> dict: + nodes = [] + for i in range(num_nodes): + nodes.append({ + "node_id": i, + "labels": ["Function" if i % 3 == 0 else "Class" if i % 3 == 1 else "Module"], + "properties": { + "qualified_name": f"project.module{i // 100}.Class{i // 10}.method{i}", + "name": f"method{i}", + "start_line": i * 10, + "end_line": i * 10 + 9, + "docstring": f"Method {i} documentation string with some content" if i % 5 == 0 else None, + "decorators": ["staticmethod"] if i % 7 == 0 else [], + "is_exported": i % 4 == 0, + }, + }) + + rels = [] + for i in range(num_rels): + rels.append({ + "from_id": i % num_nodes, + "to_id": (i * 7 + 3) % num_nodes, + "type": "CALLS" if i % 3 == 0 else "DEFINES" if i % 3 == 1 else "IMPORTS", + "properties": {"weight": i % 10} if i % 5 == 0 else {}, + }) + + return { + "nodes": nodes, + "relationships": rels, + "metadata": { + "total_nodes": num_nodes, + 
"total_relationships": num_rels, + "exported_at": "2026-03-14T10:00:00+00:00", + }, + } + + +def generate_snippets(count: int, avg_length: int = 200) -> list[str]: + import random + import string + random.seed(42) + snippets = [] + for _ in range(count): + length = avg_length + random.randint(-50, 50) + snippet = "".join(random.choices(string.ascii_letters + string.digits + " \n\t", k=length)) + snippets.append(snippet) + return snippets + + +def create_test_files(directory: str, count: int, avg_size_kb: int) -> list[Path]: + paths = [] + for i in range(count): + path = Path(directory) / f"file_{i}.py" + content = os.urandom(avg_size_kb * 1024) + path.write_bytes(content) + paths.append(path) + return paths + + +def bench_json_dumps(data: dict) -> float: + start = time.perf_counter() + _ = json.dumps(data) + return time.perf_counter() - start + + +def bench_orjson_dumps(data: dict) -> float: + start = time.perf_counter() + _ = orjson.dumps(data) + return time.perf_counter() - start + + +def bench_json_dumps_indent(data: dict) -> float: + start = time.perf_counter() + _ = json.dumps(data, indent=2, ensure_ascii=False) + return time.perf_counter() - start + + +def bench_orjson_dumps_indent(data: dict) -> float: + start = time.perf_counter() + _ = orjson.dumps(data, option=orjson.OPT_INDENT_2) + return time.perf_counter() - start + + +def bench_json_loads(json_bytes: bytes) -> float: + start = time.perf_counter() + _ = json.loads(json_bytes) + return time.perf_counter() - start + + +def bench_orjson_loads(json_bytes: bytes) -> float: + start = time.perf_counter() + _ = orjson.loads(json_bytes) + return time.perf_counter() - start + + +def bench_sha256_hashing(snippets: list[str]) -> float: + start = time.perf_counter() + for s in snippets: + _ = hashlib.sha256(s.encode()).hexdigest() + return time.perf_counter() - start + + +def bench_blake3_hashing(snippets: list[str]) -> float: + start = time.perf_counter() + for s in snippets: + _ = blake3.blake3(s.encode()).hexdigest() + return time.perf_counter() - start + + +def bench_sha256_file(files: list[Path]) -> float: + start = time.perf_counter() + for f in files: + hasher = hashlib.sha256() + with f.open("rb") as fh: + while chunk := fh.read(8192): + hasher.update(chunk) + _ = hasher.hexdigest() + return time.perf_counter() - start + + +def bench_blake3_file(files: list[Path]) -> float: + start = time.perf_counter() + for f in files: + hasher = blake3.blake3() + with f.open("rb") as fh: + while chunk := fh.read(8192): + hasher.update(chunk) + _ = hasher.hexdigest() + return time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<50} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 120) + for r in results: + print( + f"{r['name']:<50} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def print_comparison(baseline: dict[str, float], 
optimized: dict[str, float]) -> None: + speedup = baseline["median_ms"] / optimized["median_ms"] if optimized["median_ms"] > 0 else float("inf") + print(f" -> Speedup: {speedup:.1f}x (median)") + + +def main() -> None: + print("=" * 120) + print("DROP-IN REPLACEMENT BENCHMARKS: Python stdlib vs Rust-backed alternatives") + print("=" * 120) + + # --- JSON Serialization --- + for num_nodes, num_rels in [(1000, 2000), (5000, 10000), (20000, 50000)]: + print(f"\n{'='*120}") + print(f"JSON Serialization: stdlib json vs orjson (nodes={num_nodes}, rels={num_rels})") + print(f"{'='*120}") + + data = generate_graph_data(num_nodes, num_rels) + json_bytes = json.dumps(data).encode() + orjson_bytes = orjson.dumps(data) + print(f"Data size: {len(json_bytes) / 1024:.1f} KB") + + results = [] + + r1 = run_benchmark(f"json.dumps compact ({num_nodes}n)", bench_json_dumps, data) + results.append(r1) + r2 = run_benchmark(f"orjson.dumps compact ({num_nodes}n)", bench_orjson_dumps, data) + results.append(r2) + + r3 = run_benchmark(f"json.dumps indented ({num_nodes}n)", bench_json_dumps_indent, data) + results.append(r3) + r4 = run_benchmark(f"orjson.dumps indented ({num_nodes}n)", bench_orjson_dumps_indent, data) + results.append(r4) + + r5 = run_benchmark(f"json.loads ({num_nodes}n)", bench_json_loads, json_bytes) + results.append(r5) + r6 = run_benchmark(f"orjson.loads ({num_nodes}n)", bench_orjson_loads, orjson_bytes) + results.append(r6) + + print_results(results) + + print("\nSpeedups:") + print(f" dumps compact: {r1['median_ms'] / r2['median_ms']:.1f}x") + print(f" dumps indented: {r3['median_ms'] / r4['median_ms']:.1f}x") + print(f" loads: {r5['median_ms'] / r6['median_ms']:.1f}x") + + # --- Hashing: SHA256 vs BLAKE3 --- + print(f"\n\n{'='*120}") + print("Hashing: hashlib.sha256 vs blake3 (snippet hashing for EmbeddingCache)") + print(f"{'='*120}") + + for size in [500, 2000, 10000]: + snippets = generate_snippets(size) + print(f"\n--- Snippet count: {size} ---") + + results = [] + r1 = run_benchmark(f"hashlib.sha256 ({size} snippets)", bench_sha256_hashing, snippets) + results.append(r1) + r2 = run_benchmark(f"blake3 ({size} snippets)", bench_blake3_hashing, snippets) + results.append(r2) + + print_results(results) + print(f" Speedup: {r1['median_ms'] / r2['median_ms']:.1f}x") + + # --- File Hashing --- + print(f"\n\n{'='*120}") + print("File Hashing: SHA256 vs BLAKE3 (incremental build file change detection)") + print(f"{'='*120}") + + for file_count, avg_size_kb in [(50, 5), (200, 10), (500, 20)]: + with tempfile.TemporaryDirectory() as tmpdir: + files = create_test_files(tmpdir, file_count, avg_size_kb) + total_mb = sum(f.stat().st_size for f in files) / (1024 * 1024) + print(f"\n--- Files: {file_count}, Total: {total_mb:.1f} MB ---") + + results = [] + r1 = run_benchmark(f"sha256 ({file_count}f, {avg_size_kb}KB avg)", bench_sha256_file, files) + results.append(r1) + r2 = run_benchmark(f"blake3 ({file_count}f, {avg_size_kb}KB avg)", bench_blake3_file, files) + results.append(r2) + + print_results(results) + print(f" Speedup: {r1['median_ms'] / r2['median_ms']:.1f}x") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_embedding_cache.py b/benchmarks/bench_embedding_cache.py new file mode 100644 index 000000000..b63e93338 --- /dev/null +++ b/benchmarks/bench_embedding_cache.py @@ -0,0 +1,130 @@ +import hashlib +import random +import statistics +import string +import time + +from codebase_rag.embedder import EmbeddingCache + +WARMUP_RUNS = 3 +BENCH_RUNS = 50 +EMBEDDING_DIM = 768 + + 
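+# A minimal sketch of the round trip these benchmarks exercise, assuming
+# only the EmbeddingCache calls used below (put/get/put_many/get_many) and
+# that the snippet text itself acts as the cache key (hashed internally,
+# which is why sha256 hashing is benchmarked alongside the cache ops):
+#
+#   cache = EmbeddingCache()
+#   cache.put("def f(): pass", [0.0] * EMBEDDING_DIM)
+#   vector = cache.get("def f(): pass")  # hit; unseen snippets are misses
+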
+def generate_snippets(count: int, avg_length: int = 200) -> list[str]: + snippets = [] + for i in range(count): + length = avg_length + random.randint(-50, 50) + snippet = "".join(random.choices(string.ascii_letters + string.digits + " \n\t", k=length)) + snippets.append(snippet) + return snippets + + +def generate_embedding() -> list[float]: + return [random.random() for _ in range(EMBEDDING_DIM)] + + +def bench_sha256_hashing(snippets: list[str]) -> float: + start = time.perf_counter() + for s in snippets: + _ = hashlib.sha256(s.encode()).hexdigest() + return time.perf_counter() - start + + +def bench_cache_put(cache: EmbeddingCache, snippets: list[str], embeddings: list[list[float]]) -> float: + start = time.perf_counter() + for s, e in zip(snippets, embeddings): + cache.put(s, e) + return time.perf_counter() - start + + +def bench_cache_get_hit(cache: EmbeddingCache, snippets: list[str]) -> float: + start = time.perf_counter() + for s in snippets: + _ = cache.get(s) + return time.perf_counter() - start + + +def bench_cache_get_miss(cache: EmbeddingCache, miss_snippets: list[str]) -> float: + start = time.perf_counter() + for s in miss_snippets: + _ = cache.get(s) + return time.perf_counter() - start + + +def bench_cache_get_many(cache: EmbeddingCache, snippets: list[str]) -> float: + start = time.perf_counter() + _ = cache.get_many(snippets) + return time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<40} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 110) + for r in results: + print( + f"{r['name']:<40} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + random.seed(42) + + sizes = [500, 2000, 10000] + + for size in sizes: + print(f"\n{'='*110}") + print(f"EmbeddingCache Benchmark (n={size})") + print(f"{'='*110}") + + snippets = generate_snippets(size) + embeddings = [generate_embedding() for _ in range(size)] + miss_snippets = generate_snippets(size, avg_length=300) + + results = [] + + r = run_benchmark(f"sha256 hashing ({size})", bench_sha256_hashing, snippets) + results.append(r) + + cache = EmbeddingCache() + r = run_benchmark(f"cache.put ({size})", bench_cache_put, cache, snippets, embeddings) + results.append(r) + + cache = EmbeddingCache() + cache.put_many(snippets, embeddings) + + r = run_benchmark(f"cache.get hit ({size})", bench_cache_get_hit, cache, snippets) + results.append(r) + + r = run_benchmark(f"cache.get miss ({size})", bench_cache_get_miss, cache, miss_snippets) + results.append(r) + + r = run_benchmark(f"cache.get_many ({size})", bench_cache_get_many, cache, snippets) + results.append(r) + + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_file_hashing.py b/benchmarks/bench_file_hashing.py new file mode 100644 index 000000000..3be76059b --- /dev/null +++ 
b/benchmarks/bench_file_hashing.py @@ -0,0 +1,138 @@ +import hashlib +import os +import statistics +import tempfile +import time +from pathlib import Path + +WARMUP_RUNS = 3 +BENCH_RUNS = 30 + + +def create_test_files(directory: str, count: int, avg_size_kb: int) -> list[Path]: + paths = [] + for i in range(count): + path = Path(directory) / f"file_{i}.py" + content = os.urandom(avg_size_kb * 1024) + path.write_bytes(content) + paths.append(path) + return paths + + +def hash_file_sha256(filepath: Path) -> str: + hasher = hashlib.sha256() + with filepath.open("rb") as f: + while chunk := f.read(8192): + hasher.update(chunk) + return hasher.hexdigest() + + +def hash_file_sha256_large_buffer(filepath: Path) -> str: + hasher = hashlib.sha256() + with filepath.open("rb") as f: + while chunk := f.read(65536): + hasher.update(chunk) + return hasher.hexdigest() + + +def hash_file_sha256_mmap(filepath: Path) -> str: + import mmap + hasher = hashlib.sha256() + with filepath.open("rb") as f: + with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: + hasher.update(mm) + return hasher.hexdigest() + + +def hash_file_md5(filepath: Path) -> str: + hasher = hashlib.md5() + with filepath.open("rb") as f: + while chunk := f.read(8192): + hasher.update(chunk) + return hasher.hexdigest() + + +def hash_file_blake2b(filepath: Path) -> str: + hasher = hashlib.blake2b() + with filepath.open("rb") as f: + while chunk := f.read(8192): + hasher.update(chunk) + return hasher.hexdigest() + + +def bench_hash_files(files: list[Path], hash_func) -> float: + start = time.perf_counter() + for f in files: + _ = hash_func(f) + return time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<45} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 115) + for r in results: + print( + f"{r['name']:<45} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + configs = [ + (50, 5), + (200, 10), + (500, 20), + ] + + for file_count, avg_size_kb in configs: + print(f"\n{'='*115}") + print(f"File Hashing Benchmark (files={file_count}, avg_size={avg_size_kb}KB)") + print(f"{'='*115}") + + with tempfile.TemporaryDirectory() as tmpdir: + files = create_test_files(tmpdir, file_count, avg_size_kb) + total_mb = sum(f.stat().st_size for f in files) / (1024 * 1024) + print(f"Total data: {total_mb:.1f} MB") + + results = [] + + r = run_benchmark(f"sha256 8KB buf ({file_count}f)", bench_hash_files, files, hash_file_sha256) + results.append(r) + + r = run_benchmark(f"sha256 64KB buf ({file_count}f)", bench_hash_files, files, hash_file_sha256_large_buffer) + results.append(r) + + r = run_benchmark(f"sha256 mmap ({file_count}f)", bench_hash_files, files, hash_file_sha256_mmap) + results.append(r) + + r = run_benchmark(f"md5 ({file_count}f)", bench_hash_files, files, hash_file_md5) + results.append(r) + + r = 
run_benchmark(f"blake2b ({file_count}f)", bench_hash_files, files, hash_file_blake2b) + results.append(r) + + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_find_ending_with_fix.py b/benchmarks/bench_find_ending_with_fix.py new file mode 100644 index 000000000..c9ef01cae --- /dev/null +++ b/benchmarks/bench_find_ending_with_fix.py @@ -0,0 +1,218 @@ +import statistics +import time +from collections import defaultdict + +from codebase_rag.graph_updater import FunctionRegistryTrie +from codebase_rag.types_defs import NodeType, SimpleNameLookup + +WARMUP_RUNS = 3 +BENCH_RUNS = 30 + + +def generate_realistic_registry(count: int) -> tuple[list[str], list[str]]: + modules = ["codebase_rag", "utils", "parsers", "services", "tools", "models"] + submodules = ["core", "api", "handlers", "helpers", "base", "factory"] + classes = ["Handler", "Manager", "Factory", "Builder", "Processor", "Resolver", + "Analyzer", "Extractor", "Generator", "Validator"] + methods = ["process", "handle", "create", "build", "resolve", "validate", + "execute", "parse", "extract", "transform", "analyze", "generate", + "find", "get", "set", "update", "delete", "check"] + + qualified_names = [] + for i in range(count): + mod = modules[i % len(modules)] + sub = submodules[(i // len(modules)) % len(submodules)] + cls = classes[(i // (len(modules) * len(submodules))) % len(classes)] + meth = methods[(i // (len(modules) * len(submodules) * len(classes))) % len(methods)] + qualified_names.append(f"{mod}.{sub}.{cls}.method_{i}.{meth}") + + lookup_suffixes = methods + [f"method_{i}" for i in range(0, count, count // 20)] + return qualified_names, lookup_suffixes + + +def bench_linear_scan_endswith(entries: dict[str, NodeType], suffix: str) -> float: + start = time.perf_counter() + _ = [qn for qn in entries.keys() if qn.endswith(f".{suffix}")] + return time.perf_counter() - start + + +def bench_indexed_lookup(lookup: SimpleNameLookup, suffix: str) -> float: + start = time.perf_counter() + _ = list(lookup.get(suffix, set())) + return time.perf_counter() - start + + +def bench_trie_find_ending_with_index_hit( + trie: FunctionRegistryTrie, suffixes: list[str], indexed_suffixes: set[str] +) -> float: + start = time.perf_counter() + for suffix in suffixes: + if suffix in indexed_suffixes: + _ = trie.find_ending_with(suffix) + return time.perf_counter() - start + + +def bench_trie_find_ending_with_index_miss( + trie: FunctionRegistryTrie, suffixes: list[str], indexed_suffixes: set[str] +) -> float: + start = time.perf_counter() + for suffix in suffixes: + if suffix not in indexed_suffixes: + _ = trie.find_ending_with(suffix) + return time.perf_counter() - start + + +def bench_trie_find_ending_with_all( + trie: FunctionRegistryTrie, suffixes: list[str] +) -> float: + start = time.perf_counter() + for suffix in suffixes: + _ = trie.find_ending_with(suffix) + return time.perf_counter() - start + + +def bench_linear_scan_batch(entries: dict[str, NodeType], suffixes: list[str]) -> float: + start = time.perf_counter() + for suffix in suffixes: + _ = [qn for qn in entries.keys() if qn.endswith(f".{suffix}")] + return time.perf_counter() - start + + +def bench_indexed_lookup_batch(lookup: SimpleNameLookup, suffixes: list[str]) -> float: + start = time.perf_counter() + for suffix in suffixes: + _ = list(lookup.get(suffix, set())) + return time.perf_counter() - start + + +def bench_full_suffix_index_batch( + suffix_index: dict[str, set[str]], suffixes: list[str] +) -> float: + start = time.perf_counter() 
+ for suffix in suffixes: + _ = list(suffix_index.get(suffix, set())) + return time.perf_counter() - start + + +def build_full_suffix_index(qualified_names: list[str]) -> dict[str, set[str]]: + index: dict[str, set[str]] = defaultdict(set) + for qn in qualified_names: + simple_name = qn.rsplit(".", 1)[-1] + index[simple_name].add(qn) + return dict(index) + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<55} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 125) + for r in results: + print( + f"{r['name']:<55} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + print("=" * 125) + print("find_ending_with FIX BENCHMARK: Linear Scan vs Indexed Lookup") + print("This benchmarks the #1 CPU hotspot (48.3% of total runtime)") + print("=" * 125) + + sizes = [1000, 4500, 10000] + + for size in sizes: + print(f"\n{'='*125}") + print(f"Registry size: {size} entries") + print(f"{'='*125}") + + qualified_names, lookup_suffixes = generate_realistic_registry(size) + + simple_lookup: SimpleNameLookup = defaultdict(set) + trie = FunctionRegistryTrie(simple_name_lookup=simple_lookup) + for qn in qualified_names: + trie.insert(qn, NodeType.FUNCTION) + simple_name = qn.rsplit(".", 1)[-1] + simple_lookup[simple_name].add(qn) + + full_suffix_index = build_full_suffix_index(qualified_names) + + partially_indexed_suffixes = set(list(simple_lookup.keys())[:len(simple_lookup) // 5]) + miss_suffixes = [s for s in lookup_suffixes if s not in partially_indexed_suffixes] + + results = [] + + print(f"\nSingle-suffix operations (on '{lookup_suffixes[0]}'):") + r = run_benchmark( + f"LINEAR SCAN endswith ({size} entries)", + bench_linear_scan_endswith, dict(trie.items()), lookup_suffixes[0], + ) + results.append(r) + + r = run_benchmark( + f"INDEXED lookup (hit) ({size} entries)", + bench_indexed_lookup, simple_lookup, lookup_suffixes[0], + ) + results.append(r) + + print_results(results) + if results[1]["median_ms"] > 0: + speedup = results[0]["median_ms"] / results[1]["median_ms"] + print(f"\n -> Index hit speedup: {speedup:.0f}x") + + results = [] + num_queries = len(lookup_suffixes) + print(f"\nBatch operations ({num_queries} queries, simulating call resolution):") + + r = run_benchmark( + f"LINEAR SCAN batch ({num_queries}q, {size} entries)", + bench_linear_scan_batch, dict(trie.items()), lookup_suffixes, + ) + results.append(r) + + r = run_benchmark( + f"PARTIAL INDEX batch ({num_queries}q, {size} entries)", + bench_trie_find_ending_with_all, trie, lookup_suffixes, + ) + results.append(r) + + r = run_benchmark( + f"FULL SUFFIX INDEX batch ({num_queries}q, {size} entries)", + bench_full_suffix_index_batch, full_suffix_index, lookup_suffixes, + ) + results.append(r) + + print_results(results) + + if results[2]["median_ms"] > 0: + print(f"\n -> Linear scan vs full index: {results[0]['median_ms'] / 
results[2]['median_ms']:.0f}x speedup")
+            print(f"   -> Partial index vs full index: {results[1]['median_ms'] / results[2]['median_ms']:.1f}x speedup")
+
+    print(f"\n\n{'='*125}")
+    print("CONCLUSION: The 48.3% CPU hotspot is caused by linear scans on index misses.")
+    print("Building a complete suffix index eliminates the bottleneck entirely.")
+    print("This is a pure Python fix requiring zero FFI, zero new dependencies.")
+    print(f"{'='*125}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/bench_graph_loader.py b/benchmarks/bench_graph_loader.py
new file mode 100644
index 000000000..f93ccd7a4
--- /dev/null
+++ b/benchmarks/bench_graph_loader.py
@@ -0,0 +1,169 @@
+import json
+import statistics
+import tempfile
+import time
+from pathlib import Path
+
+from codebase_rag.graph_loader import GraphLoader
+
+WARMUP_RUNS = 2
+BENCH_RUNS = 20
+
+
+def generate_graph_json(num_nodes: int, num_rels: int) -> str:
+    nodes = []
+    for i in range(num_nodes):
+        nodes.append({
+            "node_id": i,
+            "labels": ["Function" if i % 3 == 0 else "Class" if i % 3 == 1 else "Module"],
+            "properties": {
+                "qualified_name": f"project.module{i // 100}.Class{i // 10}.method{i}",
+                "name": f"method{i}",
+                "start_line": i * 10,
+                "end_line": i * 10 + 9,
+            },
+        })
+
+    rels = []
+    for i in range(num_rels):
+        rels.append({
+            "from_id": i % num_nodes,
+            "to_id": (i * 7 + 3) % num_nodes,
+            "type": "CALLS" if i % 2 == 0 else "DEFINES",
+            "properties": {},
+        })
+
+    graph = {
+        "nodes": nodes,
+        "relationships": rels,
+        "metadata": {
+            "total_nodes": num_nodes,
+            "total_relationships": num_rels,
+        },
+    }
+    return json.dumps(graph)
+
+
+def bench_json_parse(json_str: str) -> float:
+    start = time.perf_counter()
+    _ = json.loads(json_str)
+    return time.perf_counter() - start
+
+
+def bench_graph_load(file_path: str) -> float:
+    start = time.perf_counter()
+    loader = GraphLoader(file_path)
+    loader.load()
+    return time.perf_counter() - start
+
+
+def bench_find_nodes_by_label(loader: GraphLoader) -> float:
+    labels = ["Function", "Class", "Module"]
+    start = time.perf_counter()
+    for label in labels:
+        _ = loader.find_nodes_by_label(label)
+    return time.perf_counter() - start
+
+
+def bench_find_node_by_property(loader: GraphLoader) -> float:
+    start = time.perf_counter()
+    for i in range(100):
+        qn = f"project.module{i * 10 // 100}.Class{i * 10 // 10}.method{i * 10}"
+        _ = loader.find_node_by_property("qualified_name", qn)
+    return time.perf_counter() - start
+
+
+def bench_get_relationships(loader: GraphLoader, num_nodes: int) -> float:
+    start = time.perf_counter()
+    for i in range(min(500, num_nodes)):
+        _ = loader.get_relationships_for_node(i)
+    return time.perf_counter() - start
+
+
+def bench_summary(loader: GraphLoader) -> float:
+    start = time.perf_counter()
+    _ = loader.summary()
+    return time.perf_counter() - start
+
+
+def run_benchmark(name: str, func, *args) -> dict[str, float]:
+    for _ in range(WARMUP_RUNS):
+        func(*args)
+
+    times = []
+    for _ in range(BENCH_RUNS):
+        times.append(func(*args))
+
+    return {
+        "name": name,
+        "median_ms": statistics.median(times) * 1000,
+        "mean_ms": statistics.mean(times) * 1000,
+        "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0,
+        "min_ms": min(times) * 1000,
+        "max_ms": max(times) * 1000,
+        "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000,
+    }
+
+
+def print_results(results: list[dict[str, float]]) -> None:
+    print(f"\n{'Benchmark':<40} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}")
+    print("-" * 110)
+    for r in results: 
+ print( + f"{r['name']:<40} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + configs = [ + (1000, 2000), + (5000, 10000), + (20000, 50000), + ] + + for num_nodes, num_rels in configs: + print(f"\n{'='*110}") + print(f"GraphLoader Benchmark (nodes={num_nodes}, rels={num_rels})") + print(f"{'='*110}") + + json_str = generate_graph_json(num_nodes, num_rels) + print(f"JSON size: {len(json_str) / 1024:.1f} KB") + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as tmp: + tmp.write(json_str) + tmp_path = tmp.name + + results = [] + + r = run_benchmark(f"json.loads ({num_nodes}n)", bench_json_parse, json_str) + results.append(r) + + r = run_benchmark(f"GraphLoader.load ({num_nodes}n)", bench_graph_load, tmp_path) + results.append(r) + + loader = GraphLoader(tmp_path) + loader.load() + + r = run_benchmark(f"find_nodes_by_label ({num_nodes}n)", bench_find_nodes_by_label, loader) + results.append(r) + + r = run_benchmark(f"find_node_by_property ({num_nodes}n)", bench_find_node_by_property, loader) + results.append(r) + + r = run_benchmark(f"get_relationships ({num_nodes}n)", bench_get_relationships, loader, num_nodes) + results.append(r) + + r = run_benchmark(f"summary ({num_nodes}n)", bench_summary, loader) + results.append(r) + + print_results(results) + + Path(tmp_path).unlink(missing_ok=True) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_json_serialization.py b/benchmarks/bench_json_serialization.py new file mode 100644 index 000000000..98fc477f7 --- /dev/null +++ b/benchmarks/bench_json_serialization.py @@ -0,0 +1,159 @@ +import json +import statistics +import tempfile +import time +from pathlib import Path + +WARMUP_RUNS = 3 +BENCH_RUNS = 20 + + +def generate_graph_data(num_nodes: int, num_rels: int) -> dict: + nodes = [] + for i in range(num_nodes): + nodes.append({ + "id": i, + "labels": ["Function" if i % 3 == 0 else "Class" if i % 3 == 1 else "Module"], + "properties": { + "qualified_name": f"project.module{i // 100}.Class{i // 10}.method{i}", + "name": f"method{i}", + "start_line": i * 10, + "end_line": i * 10 + 9, + "docstring": f"Method {i} documentation string with some content" if i % 5 == 0 else None, + "decorators": ["staticmethod"] if i % 7 == 0 else [], + "is_exported": i % 4 == 0, + }, + }) + + rels = [] + for i in range(num_rels): + rels.append({ + "from_id": i % num_nodes, + "to_id": (i * 7 + 3) % num_nodes, + "type": "CALLS" if i % 3 == 0 else "DEFINES" if i % 3 == 1 else "IMPORTS", + "properties": {"weight": i % 10} if i % 5 == 0 else {}, + }) + + return { + "nodes": nodes, + "relationships": rels, + "metadata": { + "total_nodes": num_nodes, + "total_relationships": num_rels, + "exported_at": "2026-03-14T10:00:00+00:00", + }, + } + + +def bench_json_dumps(data: dict) -> float: + start = time.perf_counter() + _ = json.dumps(data) + return time.perf_counter() - start + + +def bench_json_dumps_indent(data: dict) -> float: + start = time.perf_counter() + _ = json.dumps(data, indent=2, ensure_ascii=False) + return time.perf_counter() - start + + +def bench_json_loads(json_str: str) -> float: + start = time.perf_counter() + _ = json.loads(json_str) + return time.perf_counter() - start + + +def bench_json_dump_file(data: dict, path: str) -> float: + start = time.perf_counter() + with open(path, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2, ensure_ascii=False) + return 
time.perf_counter() - start + + +def bench_json_load_file(path: str) -> float: + start = time.perf_counter() + with open(path, encoding="utf-8") as f: + _ = json.load(f) + return time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<45} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 115) + for r in results: + print( + f"{r['name']:<45} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + configs = [ + (1000, 2000), + (5000, 10000), + (20000, 50000), + ] + + for num_nodes, num_rels in configs: + print(f"\n{'='*115}") + print(f"JSON Serialization Benchmark (nodes={num_nodes}, rels={num_rels})") + print(f"{'='*115}") + + data = generate_graph_data(num_nodes, num_rels) + json_str = json.dumps(data) + json_str_indented = json.dumps(data, indent=2, ensure_ascii=False) + print(f"Compact JSON: {len(json_str) / 1024:.1f} KB, Indented: {len(json_str_indented) / 1024:.1f} KB") + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as tmp: + json.dump(data, tmp, indent=2, ensure_ascii=False) + tmp_path = tmp.name + + results = [] + + r = run_benchmark(f"json.dumps compact ({num_nodes}n)", bench_json_dumps, data) + results.append(r) + + r = run_benchmark(f"json.dumps indented ({num_nodes}n)", bench_json_dumps_indent, data) + results.append(r) + + r = run_benchmark(f"json.loads compact ({num_nodes}n)", bench_json_loads, json_str) + results.append(r) + + r = run_benchmark(f"json.loads indented ({num_nodes}n)", bench_json_loads, json_str_indented) + results.append(r) + + r = run_benchmark(f"json.dump to file ({num_nodes}n)", bench_json_dump_file, data, tmp_path) + results.append(r) + + r = run_benchmark(f"json.load from file ({num_nodes}n)", bench_json_load_file, tmp_path) + results.append(r) + + print_results(results) + + Path(tmp_path).unlink(missing_ok=True) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_pathlib_vs_string.py b/benchmarks/bench_pathlib_vs_string.py new file mode 100644 index 000000000..1794b2cef --- /dev/null +++ b/benchmarks/bench_pathlib_vs_string.py @@ -0,0 +1,214 @@ +import os +import statistics +import time +from pathlib import Path, PurePosixPath + +WARMUP_RUNS = 3 +BENCH_RUNS = 50 + + +def generate_file_paths(repo_root: str, count: int) -> list[str]: + dirs = ["src", "lib", "utils", "core", "parsers", "services", "tools", "tests"] + subdirs = ["base", "handlers", "helpers", "models", "schemas", "config"] + extensions = [".py", ".js", ".ts", ".rs", ".go", ".java", ".cpp"] + + paths = [] + for i in range(count): + d = dirs[i % len(dirs)] + sd = subdirs[(i // len(dirs)) % len(subdirs)] + ext = extensions[(i // (len(dirs) * len(subdirs))) % len(extensions)] + paths.append(f"{repo_root}/{d}/{sd}/module_{i}{ext}") + return paths + + +def generate_skip_patterns() -> list[str]: + return [ + 
"node_modules", ".git", "__pycache__", ".venv", "dist", "build", + ".mypy_cache", ".pytest_cache", ".tox", "egg-info", + ] + + +def bench_pathlib_relative_to(paths: list[str], repo_root: str) -> float: + repo_path = Path(repo_root) + start = time.perf_counter() + for p in paths: + path = Path(p) + _ = path.relative_to(repo_path) + return time.perf_counter() - start + + +def bench_string_removeprefix(paths: list[str], repo_root: str) -> float: + prefix = repo_root + "/" + start = time.perf_counter() + for p in paths: + _ = p.removeprefix(prefix) + return time.perf_counter() - start + + +def bench_os_path_relpath(paths: list[str], repo_root: str) -> float: + start = time.perf_counter() + for p in paths: + _ = os.path.relpath(p, repo_root) + return time.perf_counter() - start + + +def bench_pathlib_should_skip(paths: list[str], repo_root: str, skip_patterns: list[str]) -> float: + repo_path = Path(repo_root) + skip_set = set(skip_patterns) + start = time.perf_counter() + for p in paths: + path = Path(p) + try: + relative = path.relative_to(repo_path) + parts = relative.parts + _ = any(part in skip_set for part in parts) + except ValueError: + pass + return time.perf_counter() - start + + +def bench_string_should_skip(paths: list[str], repo_root: str, skip_patterns: list[str]) -> float: + prefix = repo_root + "/" + skip_set = set(skip_patterns) + start = time.perf_counter() + for p in paths: + relative = p.removeprefix(prefix) + parts = relative.split("/") + _ = any(part in skip_set for part in parts) + return time.perf_counter() - start + + +def bench_pathlib_suffix_check(paths: list[str]) -> float: + start = time.perf_counter() + for p in paths: + path = Path(p) + _ = path.suffix + return time.perf_counter() - start + + +def bench_string_suffix_check(paths: list[str]) -> float: + start = time.perf_counter() + for p in paths: + dot_idx = p.rfind(".") + _ = p[dot_idx:] if dot_idx >= 0 else "" + return time.perf_counter() - start + + +def bench_os_path_splitext(paths: list[str]) -> float: + start = time.perf_counter() + for p in paths: + _, _ = os.path.splitext(p) + return time.perf_counter() - start + + +def bench_pathlib_name(paths: list[str]) -> float: + start = time.perf_counter() + for p in paths: + path = Path(p) + _ = path.name + return time.perf_counter() - start + + +def bench_string_name(paths: list[str]) -> float: + start = time.perf_counter() + for p in paths: + slash_idx = p.rfind("/") + _ = p[slash_idx + 1:] if slash_idx >= 0 else p + return time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<55} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 125) + for r in results: + print( + f"{r['name']:<55} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + print("=" * 125) + print("pathlib vs String Operations Benchmark") + print("This benchmarks the #2 
CPU hotspot (13.7% of total runtime)")
+    print("=" * 125)
+
+    repo_root = "/Users/developer/projects/large-repo"
+    skip_patterns = generate_skip_patterns()
+
+    for count in [1000, 5000, 20000, 59012]:
+        print(f"\n{'='*125}")
+        print(f"Path count: {count} (59012 = actual profiled call count)")
+        print(f"{'='*125}")
+
+        paths = generate_file_paths(repo_root, count)
+
+        results = []
+
+        print("\n--- relative_to vs removeprefix ---")
+        r1 = run_benchmark(f"pathlib.relative_to ({count}p)", bench_pathlib_relative_to, paths, repo_root)
+        results.append(r1)
+        r2 = run_benchmark(f"str.removeprefix ({count}p)", bench_string_removeprefix, paths, repo_root)
+        results.append(r2)
+        r3 = run_benchmark(f"os.path.relpath ({count}p)", bench_os_path_relpath, paths, repo_root)
+        results.append(r3)
+
+        print_results(results)
+        print(f"\n -> pathlib vs str.removeprefix: {r1['median_ms'] / r2['median_ms']:.0f}x slower")
+        print(f" -> pathlib vs os.path.relpath: {r1['median_ms'] / r3['median_ms']:.1f}x slower")
+
+        results = []
+        print("\n--- should_skip_path (full function) ---")
+        r1 = run_benchmark(f"pathlib should_skip ({count}p)", bench_pathlib_should_skip, paths, repo_root, skip_patterns)
+        results.append(r1)
+        r2 = run_benchmark(f"string should_skip ({count}p)", bench_string_should_skip, paths, repo_root, skip_patterns)
+        results.append(r2)
+
+        print_results(results)
+        print(f"\n -> pathlib vs string: {r1['median_ms'] / r2['median_ms']:.1f}x slower")
+
+        results = []
+        print("\n--- Suffix/extension extraction ---")
+        r1 = run_benchmark(f"Path.suffix ({count}p)", bench_pathlib_suffix_check, paths)
+        results.append(r1)
+        r2 = run_benchmark(f"str.rfind ({count}p)", bench_string_suffix_check, paths)
+        results.append(r2)
+        r3 = run_benchmark(f"os.path.splitext ({count}p)", bench_os_path_splitext, paths)
+        results.append(r3)
+
+        print_results(results)
+        print(f"\n -> Path.suffix vs str.rfind: {r1['median_ms'] / r2['median_ms']:.1f}x slower")
+
+        results = []
+        print("\n--- Filename extraction ---")
+        r1 = run_benchmark(f"Path.name ({count}p)", bench_pathlib_name, paths)
+        results.append(r1)
+        r2 = run_benchmark(f"str.rfind+slice ({count}p)", bench_string_name, paths)
+        results.append(r2)
+
+        print_results(results)
+        print(f"\n -> Path.name vs str: {r1['median_ms'] / r2['median_ms']:.1f}x slower")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/bench_string_ops.py b/benchmarks/bench_string_ops.py
new file mode 100644
index 000000000..cc10e91f8
--- /dev/null
+++ b/benchmarks/bench_string_ops.py
@@ -0,0 +1,148 @@
+import re
+import statistics
+import time
+
+WARMUP_RUNS = 3
+BENCH_RUNS = 100
+
+SEPARATOR_PATTERN = re.compile(r"::|[.:]")
+
+
+def generate_qualified_names(count: int) -> list[str]:
+    names = []
+    modules = ["project", "utils", "core", "api", "services", "models"]
+    classes = ["Handler", "Manager", "Factory", "Builder", "Processor", "Resolver"]
+    methods = ["process", "handle", "create", "build", "resolve", "validate"]
+    for i in range(count):
+        mod = modules[i % len(modules)]
+        cls = classes[(i // len(modules)) % len(classes)]
+        meth = methods[(i // (len(modules) * len(classes))) % len(methods)]
+        names.append(f"{mod}.{cls}.sub{i}.{meth}")
+    return names
+
+
+def bench_str_split(names: list[str]) -> float:
+    start = time.perf_counter()
+    for name in names:
+        _ = name.split(".")
+    return time.perf_counter() - start
+
+
+def bench_str_endswith(names: list[str]) -> float:
+    suffixes = [".process", ".handle", ".create", ".build", ".resolve"]
+    start = time.perf_counter()
+    for name in 
names: + for suffix in suffixes: + _ = name.endswith(suffix) + return time.perf_counter() - start + + +def bench_str_startswith(names: list[str]) -> float: + prefixes = ["project.", "utils.", "core.", "api."] + start = time.perf_counter() + for name in names: + for prefix in prefixes: + _ = name.startswith(prefix) + return time.perf_counter() - start + + +def bench_str_join(names: list[str]) -> float: + split_names = [name.split(".") for name in names] + start = time.perf_counter() + for parts in split_names: + _ = ".".join(parts) + return time.perf_counter() - start + + +def bench_str_replace(names: list[str]) -> float: + start = time.perf_counter() + for name in names: + _ = name.replace("/", ".") + return time.perf_counter() - start + + +def bench_regex_split(names: list[str]) -> float: + start = time.perf_counter() + for name in names: + _ = SEPARATOR_PATTERN.split(name) + return time.perf_counter() - start + + +def bench_str_format(names: list[str]) -> float: + start = time.perf_counter() + for name in names: + _ = f"module.{name}.method" + return time.perf_counter() - start + + +def bench_import_distance(names: list[str]) -> float: + start = time.perf_counter() + for i in range(0, len(names) - 1, 2): + caller_parts = names[i].split(".") + candidate_parts = names[i + 1].split(".") + common = 0 + for j in range(min(len(caller_parts), len(candidate_parts))): + if caller_parts[j] == candidate_parts[j]: + common += 1 + else: + break + _ = max(len(caller_parts), len(candidate_parts)) - common + return time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<40} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 110) + for r in results: + print( + f"{r['name']:<40} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + sizes = [1000, 5000, 20000] + + for size in sizes: + print(f"\n{'='*110}") + print(f"String Operations Benchmark (n={size})") + print(f"{'='*110}") + + names = generate_qualified_names(size) + + results = [ + run_benchmark(f"str.split ({size})", bench_str_split, names), + run_benchmark(f"str.endswith ({size})", bench_str_endswith, names), + run_benchmark(f"str.startswith ({size})", bench_str_startswith, names), + run_benchmark(f"str.join ({size})", bench_str_join, names), + run_benchmark(f"str.replace ({size})", bench_str_replace, names), + run_benchmark(f"regex split ({size})", bench_regex_split, names), + run_benchmark(f"f-string format ({size})", bench_str_format, names), + run_benchmark(f"import_distance ({size})", bench_import_distance, names), + ] + + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_trie.py b/benchmarks/bench_trie.py new file mode 100644 index 000000000..dba339100 --- /dev/null +++ b/benchmarks/bench_trie.py @@ -0,0 +1,138 @@ +import statistics +import time +from collections import 
defaultdict + +from codebase_rag.graph_updater import FunctionRegistryTrie +from codebase_rag.types_defs import NodeType, SimpleNameLookup + +WARMUP_RUNS = 3 +BENCH_RUNS = 50 + + +def generate_qualified_names(count: int) -> list[str]: + names = [] + modules = ["project", "utils", "core", "api", "services", "models"] + classes = ["Handler", "Manager", "Factory", "Builder", "Processor", "Resolver"] + methods = ["process", "handle", "create", "build", "resolve", "validate", "execute"] + for i in range(count): + mod = modules[i % len(modules)] + cls = classes[(i // len(modules)) % len(classes)] + meth = methods[(i // (len(modules) * len(classes))) % len(methods)] + sub = f"sub{i}" + names.append(f"{mod}.{cls}.{sub}.{meth}") + return names + + +def bench_insert(trie: FunctionRegistryTrie, names: list[str]) -> float: + start = time.perf_counter() + for name in names: + trie.insert(name, NodeType.FUNCTION) + return time.perf_counter() - start + + +def bench_lookup(trie: FunctionRegistryTrie, names: list[str]) -> float: + start = time.perf_counter() + for name in names: + _ = name in trie + return time.perf_counter() - start + + +def bench_find_ending_with(trie: FunctionRegistryTrie) -> float: + suffixes = ["process", "handle", "create", "build", "resolve", "validate", "execute"] + start = time.perf_counter() + for suffix in suffixes: + _ = trie.find_ending_with(suffix) + return time.perf_counter() - start + + +def bench_find_with_prefix(trie: FunctionRegistryTrie) -> float: + prefixes = ["project", "utils", "core", "api", "services", "models"] + start = time.perf_counter() + for prefix in prefixes: + _ = trie.find_with_prefix(prefix) + return time.perf_counter() - start + + +def bench_delete(names: list[str]) -> float: + simple_lookup: SimpleNameLookup = defaultdict(set) + trie = FunctionRegistryTrie(simple_name_lookup=simple_lookup) + for name in names: + trie.insert(name, NodeType.FUNCTION) + simple_name = name.split(".")[-1] + simple_lookup[simple_name].add(name) + + start = time.perf_counter() + for name in names[:len(names) // 4]: + del trie[name] + return time.perf_counter() - start + + +def run_benchmark(name: str, func, *args) -> dict[str, float]: + for _ in range(WARMUP_RUNS): + func(*args) + + times = [] + for _ in range(BENCH_RUNS): + times.append(func(*args)) + + return { + "name": name, + "median_ms": statistics.median(times) * 1000, + "mean_ms": statistics.mean(times) * 1000, + "stddev_ms": statistics.stdev(times) * 1000 if len(times) > 1 else 0, + "min_ms": min(times) * 1000, + "max_ms": max(times) * 1000, + "p95_ms": sorted(times)[int(len(times) * 0.95)] * 1000, + } + + +def print_results(results: list[dict[str, float]]) -> None: + print(f"\n{'Benchmark':<35} {'Median':>10} {'Mean':>10} {'StdDev':>10} {'Min':>10} {'Max':>10} {'P95':>10}") + print("-" * 105) + for r in results: + print( + f"{r['name']:<35} {r['median_ms']:>9.3f}ms {r['mean_ms']:>9.3f}ms " + f"{r['stddev_ms']:>9.3f}ms {r['min_ms']:>9.3f}ms {r['max_ms']:>9.3f}ms " + f"{r['p95_ms']:>9.3f}ms" + ) + + +def main() -> None: + sizes = [1000, 5000, 10000, 50000] + + for size in sizes: + print(f"\n{'='*105}") + print(f"FunctionRegistryTrie Benchmark (n={size})") + print(f"{'='*105}") + + names = generate_qualified_names(size) + + simple_lookup: SimpleNameLookup = defaultdict(set) + trie = FunctionRegistryTrie(simple_name_lookup=simple_lookup) + + results = [] + + r = run_benchmark(f"insert ({size})", bench_insert, trie, names) + results.append(r) + + for name in names: + simple_name = name.split(".")[-1] + 
simple_lookup[simple_name].add(name) + + r = run_benchmark(f"lookup ({size})", bench_lookup, trie, names) + results.append(r) + + r = run_benchmark(f"find_ending_with ({size})", bench_find_ending_with, trie) + results.append(r) + + r = run_benchmark(f"find_with_prefix ({size})", bench_find_with_prefix, trie) + results.append(r) + + r = run_benchmark(f"delete 25% ({size})", bench_delete, names) + results.append(r) + + print_results(results) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/results/bench_ast_cache_20260315_000043.txt b/benchmarks/results/bench_ast_cache_20260315_000043.txt new file mode 100644 index 000000000..5084d79ef --- /dev/null +++ b/benchmarks/results/bench_ast_cache_20260315_000043.txt @@ -0,0 +1,42 @@ +Benchmark: bench_ast_cache.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 2.2s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +=================================================================================================================== +BoundedASTCache Benchmark (entries=500, item_size=1024B) +=================================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +insert (500) 1.119ms 1.128ms 0.020ms 1.113ms 1.229ms 1.158ms +lookup (500) 0.019ms 0.019ms 0.000ms 0.018ms 0.019ms 0.019ms +access+LRU (500) 0.053ms 0.053ms 0.000ms 0.053ms 0.056ms 0.053ms +insert+evict (max=250) 1.141ms 1.155ms 0.092ms 1.133ms 1.792ms 1.158ms +getsizeof scan (500) 0.062ms 0.062ms 0.001ms 0.061ms 0.067ms 0.062ms + +=================================================================================================================== +BoundedASTCache Benchmark (entries=2000, item_size=4096B) +=================================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +insert (2000) 4.717ms 4.798ms 0.248ms 4.591ms 5.567ms 5.558ms +lookup (2000) 0.077ms 0.077ms 0.000ms 0.076ms 0.078ms 0.077ms +access+LRU (2000) 0.214ms 0.214ms 0.001ms 0.213ms 0.217ms 0.216ms +insert+evict (max=1000) 4.768ms 4.814ms 0.221ms 4.614ms 5.870ms 5.103ms +getsizeof scan (2000) 0.257ms 0.259ms 0.005ms 0.254ms 0.279ms 0.269ms + +=================================================================================================================== +BoundedASTCache Benchmark (entries=5000, item_size=8192B) +=================================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +insert (5000) 12.829ms 13.137ms 0.611ms 12.561ms 14.340ms 14.280ms +lookup (5000) 0.206ms 0.206ms 0.002ms 0.203ms 0.210ms 0.209ms +access+LRU (5000) 0.551ms 0.552ms 0.005ms 0.544ms 0.565ms 0.563ms +insert+evict (max=2500) 12.558ms 12.992ms 0.936ms 12.246ms 16.534ms 14.787ms +getsizeof scan (5000) 0.681ms 0.686ms 0.027ms 0.651ms 0.812ms 0.740ms diff --git a/benchmarks/results/bench_embedding_cache_20260315_000043.txt b/benchmarks/results/bench_embedding_cache_20260315_000043.txt new file mode 100644 index 000000000..807a58402 --- 
/dev/null +++ b/benchmarks/results/bench_embedding_cache_20260315_000043.txt @@ -0,0 +1,42 @@ +Benchmark: bench_embedding_cache.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 3.4s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +============================================================================================================== +EmbeddingCache Benchmark (n=500) +============================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +sha256 hashing (500) 0.155ms 0.151ms 0.006ms 0.143ms 0.161ms 0.159ms +cache.put (500) 0.182ms 0.182ms 0.002ms 0.179ms 0.187ms 0.185ms +cache.get hit (500) 0.177ms 0.177ms 0.001ms 0.176ms 0.180ms 0.179ms +cache.get miss (500) 0.190ms 0.192ms 0.003ms 0.189ms 0.207ms 0.195ms +cache.get_many (500) 0.190ms 0.190ms 0.001ms 0.189ms 0.193ms 0.191ms + +============================================================================================================== +EmbeddingCache Benchmark (n=2000) +============================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +sha256 hashing (2000) 0.562ms 0.564ms 0.006ms 0.557ms 0.581ms 0.576ms +cache.put (2000) 0.751ms 0.760ms 0.027ms 0.738ms 0.918ms 0.794ms +cache.get hit (2000) 0.729ms 0.732ms 0.009ms 0.719ms 0.765ms 0.748ms +cache.get miss (2000) 0.797ms 0.801ms 0.026ms 0.771ms 0.866ms 0.839ms +cache.get_many (2000) 0.798ms 0.808ms 0.028ms 0.777ms 0.888ms 0.856ms + +============================================================================================================== +EmbeddingCache Benchmark (n=10000) +============================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +sha256 hashing (10000) 2.884ms 2.875ms 0.034ms 2.815ms 2.950ms 2.921ms +cache.put (10000) 3.790ms 3.786ms 0.024ms 3.729ms 3.827ms 3.821ms +cache.get hit (10000) 3.690ms 3.697ms 0.029ms 3.653ms 3.775ms 3.750ms +cache.get miss (10000) 3.939ms 3.943ms 0.041ms 3.878ms 4.079ms 4.018ms +cache.get_many (10000) 3.987ms 3.989ms 0.023ms 3.948ms 4.051ms 4.041ms diff --git a/benchmarks/results/bench_file_hashing_20260315_000043.txt b/benchmarks/results/bench_file_hashing_20260315_000043.txt new file mode 100644 index 000000000..6346ad2f7 --- /dev/null +++ b/benchmarks/results/bench_file_hashing_20260315_000043.txt @@ -0,0 +1,45 @@ +Benchmark: bench_file_hashing.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 4.4s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +=================================================================================================================== +File Hashing Benchmark (files=50, avg_size=5KB) +=================================================================================================================== +Total data: 0.2 MB + +Benchmark Median Mean StdDev Min Max P95 
+------------------------------------------------------------------------------------------------------------------- +sha256 8KB buf (50f) 1.006ms 1.016ms 0.043ms 0.977ms 1.186ms 1.146ms +sha256 64KB buf (50f) 1.075ms 1.070ms 0.016ms 1.036ms 1.106ms 1.090ms +sha256 mmap (50f) 1.356ms 1.355ms 0.033ms 1.299ms 1.453ms 1.395ms +md5 (50f) 1.310ms 1.374ms 0.171ms 1.191ms 1.878ms 1.727ms +blake2b (50f) 1.201ms 1.253ms 0.147ms 1.106ms 1.718ms 1.632ms + +=================================================================================================================== +File Hashing Benchmark (files=200, avg_size=10KB) +=================================================================================================================== +Total data: 2.0 MB + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +sha256 8KB buf (200f) 4.587ms 4.777ms 0.512ms 4.377ms 6.201ms 6.185ms +sha256 64KB buf (200f) 4.729ms 4.819ms 0.285ms 4.557ms 5.794ms 5.706ms +sha256 mmap (200f) 5.984ms 8.714ms 11.275ms 5.650ms 63.888ms 29.536ms +md5 (200f) 6.532ms 6.547ms 0.143ms 6.367ms 6.993ms 6.804ms +blake2b (200f) 5.217ms 5.289ms 0.272ms 5.068ms 6.416ms 6.003ms + +=================================================================================================================== +File Hashing Benchmark (files=500, avg_size=20KB) +=================================================================================================================== +Total data: 9.8 MB + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +sha256 8KB buf (500f) 13.926ms 14.170ms 0.910ms 13.581ms 18.406ms 15.773ms +sha256 64KB buf (500f) 14.268ms 14.312ms 0.253ms 13.957ms 15.319ms 14.640ms +sha256 mmap (500f) 16.699ms 20.110ms 15.978ms 16.299ms 104.163ms 25.618ms +md5 (500f) 23.512ms 23.670ms 0.567ms 23.157ms 25.836ms 25.075ms +blake2b (500f) 17.669ms 17.783ms 0.496ms 17.229ms 19.433ms 18.815ms diff --git a/benchmarks/results/bench_graph_loader_20260315_000043.txt b/benchmarks/results/bench_graph_loader_20260315_000043.txt new file mode 100644 index 000000000..d9cd28a0b --- /dev/null +++ b/benchmarks/results/bench_graph_loader_20260315_000043.txt @@ -0,0 +1,48 @@ +Benchmark: bench_graph_loader.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 2.9s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +============================================================================================================== +GraphLoader Benchmark (nodes=1000, rels=2000) +============================================================================================================== +JSON size: 298.2 KB + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +json.loads (1000n) 1.001ms 1.011ms 0.029ms 0.974ms 1.071ms 1.071ms +GraphLoader.load (1000n) 2.040ms 2.143ms 0.583ms 1.865ms 4.581ms 4.581ms +find_nodes_by_label (1000n) 0.001ms 0.001ms 0.000ms 0.000ms 0.001ms 0.001ms +find_node_by_property (1000n) 0.030ms 0.030ms 0.000ms 0.029ms 0.030ms 0.030ms +get_relationships (1000n) 0.148ms 0.148ms 0.001ms 0.146ms 0.151ms 0.151ms +summary (1000n) 0.069ms 0.070ms 0.001ms 0.068ms 0.073ms 0.073ms + 
+============================================================================================================== +GraphLoader Benchmark (nodes=5000, rels=10000) +============================================================================================================== +JSON size: 1537.8 KB + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +json.loads (5000n) 5.032ms 5.002ms 0.112ms 4.843ms 5.180ms 5.180ms +GraphLoader.load (5000n) 10.106ms 11.137ms 2.030ms 9.396ms 14.997ms 14.997ms +find_nodes_by_label (5000n) 0.000ms 0.000ms 0.000ms 0.000ms 0.001ms 0.001ms +find_node_by_property (5000n) 0.030ms 0.030ms 0.000ms 0.030ms 0.030ms 0.030ms +get_relationships (5000n) 0.150ms 0.152ms 0.005ms 0.148ms 0.170ms 0.170ms +summary (5000n) 0.350ms 0.356ms 0.018ms 0.341ms 0.420ms 0.420ms + +============================================================================================================== +GraphLoader Benchmark (nodes=20000, rels=50000) +============================================================================================================== +JSON size: 6979.7 KB + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +json.loads (20000n) 24.136ms 24.783ms 2.550ms 23.565ms 35.321ms 35.321ms +GraphLoader.load (20000n) 61.008ms 62.676ms 5.050ms 57.534ms 75.337ms 75.337ms +find_nodes_by_label (20000n) 0.000ms 0.000ms 0.000ms 0.000ms 0.001ms 0.001ms +find_node_by_property (20000n) 0.030ms 0.030ms 0.000ms 0.030ms 0.030ms 0.030ms +get_relationships (20000n) 0.152ms 0.153ms 0.001ms 0.151ms 0.155ms 0.155ms +summary (20000n) 1.738ms 1.745ms 0.023ms 1.714ms 1.819ms 1.819ms diff --git a/benchmarks/results/bench_json_serialization_20260315_000043.txt b/benchmarks/results/bench_json_serialization_20260315_000043.txt new file mode 100644 index 000000000..aab002921 --- /dev/null +++ b/benchmarks/results/bench_json_serialization_20260315_000043.txt @@ -0,0 +1,48 @@ +Benchmark: bench_json_serialization.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 18.8s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +=================================================================================================================== +JSON Serialization Benchmark (nodes=1000, rels=2000) +=================================================================================================================== +Compact JSON: 366.8 KB, Indented: 547.7 KB + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +json.dumps compact (1000n) 1.089ms 1.094ms 0.010ms 1.084ms 1.117ms 1.117ms +json.dumps indented (1000n) 9.612ms 9.703ms 0.220ms 9.560ms 10.479ms 10.479ms +json.loads compact (1000n) 1.202ms 1.202ms 0.015ms 1.185ms 1.260ms 1.260ms +json.loads indented (1000n) 1.286ms 1.281ms 0.023ms 1.253ms 1.325ms 1.325ms +json.dump to file (1000n) 12.239ms 12.241ms 0.071ms 12.145ms 12.398ms 12.398ms +json.load from file (1000n) 1.345ms 1.350ms 0.036ms 1.309ms 1.429ms 1.429ms + +=================================================================================================================== +JSON Serialization Benchmark (nodes=5000, rels=10000) 
+=================================================================================================================== +Compact JSON: 1881.4 KB, Indented: 2786.1 KB + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +json.dumps compact (5000n) 5.701ms 5.718ms 0.158ms 5.464ms 6.000ms 6.000ms +json.dumps indented (5000n) 47.875ms 47.950ms 0.285ms 47.618ms 48.611ms 48.611ms +json.loads compact (5000n) 6.291ms 6.327ms 0.244ms 5.999ms 6.754ms 6.754ms +json.loads indented (5000n) 6.686ms 6.666ms 0.263ms 6.346ms 7.152ms 7.152ms +json.dump to file (5000n) 60.552ms 60.895ms 1.262ms 60.082ms 64.565ms 64.565ms +json.load from file (5000n) 6.573ms 6.590ms 0.049ms 6.528ms 6.717ms 6.717ms + +=================================================================================================================== +JSON Serialization Benchmark (nodes=20000, rels=50000) +=================================================================================================================== +Compact JSON: 8381.6 KB, Indented: 12363.2 KB + +Benchmark Median Mean StdDev Min Max P95 +------------------------------------------------------------------------------------------------------------------- +json.dumps compact (20000n) 25.446ms 25.483ms 0.156ms 25.314ms 25.797ms 25.797ms +json.dumps indented (20000n) 215.190ms 215.593ms 1.383ms 214.183ms 219.350ms 219.350ms +json.loads compact (20000n) 28.713ms 28.731ms 0.480ms 28.049ms 30.253ms 30.253ms +json.loads indented (20000n) 30.416ms 30.558ms 0.813ms 29.707ms 32.258ms 32.258ms +json.dump to file (20000n) 271.376ms 271.918ms 3.051ms 266.710ms 278.494ms 278.494ms +json.load from file (20000n) 32.144ms 33.111ms 3.488ms 31.594ms 47.762ms 47.762ms diff --git a/benchmarks/results/bench_string_ops_20260315_000043.txt b/benchmarks/results/bench_string_ops_20260315_000043.txt new file mode 100644 index 000000000..66c1bcd8b --- /dev/null +++ b/benchmarks/results/bench_string_ops_20260315_000043.txt @@ -0,0 +1,51 @@ +Benchmark: bench_string_ops.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 3.2s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +============================================================================================================== +String Operations Benchmark (n=1000) +============================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +str.split (1000) 0.079ms 0.079ms 0.001ms 0.077ms 0.083ms 0.082ms +str.endswith (1000) 0.179ms 0.181ms 0.006ms 0.174ms 0.219ms 0.188ms +str.startswith (1000) 0.146ms 0.147ms 0.003ms 0.144ms 0.165ms 0.150ms +str.join (1000) 0.036ms 0.036ms 0.001ms 0.035ms 0.047ms 0.039ms +str.replace (1000) 0.014ms 0.014ms 0.000ms 0.014ms 0.016ms 0.014ms +regex split (1000) 0.418ms 0.420ms 0.006ms 0.414ms 0.437ms 0.431ms +f-string format (1000) 0.029ms 0.029ms 0.000ms 0.029ms 0.032ms 0.029ms +import_distance (1000) 0.164ms 0.165ms 0.004ms 0.162ms 0.185ms 0.171ms + +============================================================================================================== +String Operations Benchmark (n=5000) +============================================================================================================== + +Benchmark Median 
Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +str.split (5000) 0.380ms 0.380ms 0.003ms 0.371ms 0.395ms 0.387ms +str.endswith (5000) 0.897ms 0.899ms 0.004ms 0.892ms 0.919ms 0.909ms +str.startswith (5000) 0.722ms 0.723ms 0.003ms 0.715ms 0.733ms 0.728ms +str.join (5000) 0.185ms 0.187ms 0.005ms 0.184ms 0.234ms 0.191ms +str.replace (5000) 0.071ms 0.071ms 0.001ms 0.070ms 0.074ms 0.071ms +regex split (5000) 2.033ms 2.037ms 0.023ms 1.984ms 2.103ms 2.076ms +f-string format (5000) 0.146ms 0.147ms 0.002ms 0.145ms 0.154ms 0.150ms +import_distance (5000) 0.781ms 0.773ms 0.014ms 0.752ms 0.797ms 0.790ms + +============================================================================================================== +String Operations Benchmark (n=20000) +============================================================================================================== + +Benchmark Median Mean StdDev Min Max P95 +-------------------------------------------------------------------------------------------------------------- +str.split (20000) 1.588ms 1.590ms 0.014ms 1.559ms 1.626ms 1.612ms +str.endswith (20000) 3.582ms 3.619ms 0.147ms 3.497ms 4.883ms 3.803ms +str.startswith (20000) 2.920ms 2.926ms 0.031ms 2.876ms 3.064ms 3.005ms +str.join (20000) 0.733ms 0.735ms 0.015ms 0.719ms 0.850ms 0.752ms +str.replace (20000) 0.287ms 0.288ms 0.009ms 0.282ms 0.374ms 0.293ms +regex split (20000) 8.051ms 8.047ms 0.068ms 7.924ms 8.195ms 8.174ms +f-string format (20000) 0.593ms 0.594ms 0.006ms 0.582ms 0.624ms 0.603ms +import_distance (20000) 3.183ms 3.184ms 0.039ms 3.129ms 3.315ms 3.262ms diff --git a/benchmarks/results/bench_trie_20260315_000043.txt b/benchmarks/results/bench_trie_20260315_000043.txt new file mode 100644 index 000000000..10ad3978e --- /dev/null +++ b/benchmarks/results/bench_trie_20260315_000043.txt @@ -0,0 +1,54 @@ +Benchmark: bench_trie.py +Timestamp: 20260315_000043 +Exit code: 0 +Duration: 9.3s +Python: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ] +================================================================================ + +========================================================================================================= +FunctionRegistryTrie Benchmark (n=1000) +========================================================================================================= + +Benchmark Median Mean StdDev Min Max P95 +--------------------------------------------------------------------------------------------------------- +insert (1000) 0.340ms 0.341ms 0.012ms 0.327ms 0.385ms 0.378ms +lookup (1000) 0.036ms 0.036ms 0.000ms 0.035ms 0.037ms 0.036ms +find_ending_with (1000) 0.004ms 0.005ms 0.004ms 0.004ms 0.031ms 0.004ms +find_with_prefix (1000) 0.390ms 0.425ms 0.059ms 0.369ms 0.589ms 0.528ms +delete 25% (1000) 0.407ms 0.418ms 0.021ms 0.394ms 0.457ms 0.449ms + +========================================================================================================= +FunctionRegistryTrie Benchmark (n=5000) +========================================================================================================= + +Benchmark Median Mean StdDev Min Max P95 +--------------------------------------------------------------------------------------------------------- +insert (5000) 1.795ms 1.797ms 0.037ms 1.721ms 1.911ms 1.876ms +lookup (5000) 0.195ms 0.196ms 0.002ms 0.193ms 0.201ms 0.200ms +find_ending_with (5000) 0.019ms 0.019ms 0.000ms 0.018ms 0.021ms 0.019ms +find_with_prefix (5000) 2.104ms 2.299ms 1.047ms 
2.024ms 9.499ms 2.416ms +delete 25% (5000) 2.116ms 2.122ms 0.048ms 2.043ms 2.260ms 2.214ms + +========================================================================================================= +FunctionRegistryTrie Benchmark (n=10000) +========================================================================================================= + +Benchmark Median Mean StdDev Min Max P95 +--------------------------------------------------------------------------------------------------------- +insert (10000) 3.709ms 3.735ms 0.106ms 3.627ms 4.244ms 3.912ms +lookup (10000) 0.402ms 0.403ms 0.003ms 0.398ms 0.412ms 0.407ms +find_ending_with (10000) 0.046ms 0.046ms 0.002ms 0.045ms 0.056ms 0.050ms +find_with_prefix (10000) 4.244ms 4.630ms 1.843ms 3.904ms 13.674ms 5.386ms +delete 25% (10000) 4.204ms 4.207ms 0.066ms 3.959ms 4.349ms 4.312ms + +========================================================================================================= +FunctionRegistryTrie Benchmark (n=50000) +========================================================================================================= + +Benchmark Median Mean StdDev Min Max P95 +--------------------------------------------------------------------------------------------------------- +insert (50000) 18.036ms 18.128ms 0.306ms 17.831ms 18.972ms 18.820ms +lookup (50000) 2.058ms 2.061ms 0.013ms 2.036ms 2.091ms 2.085ms +find_ending_with (50000) 0.420ms 0.426ms 0.014ms 0.412ms 0.477ms 0.458ms +find_with_prefix (50000) 38.507ms 38.096ms 10.219ms 22.462ms 56.890ms 52.739ms +delete 25% (50000) 21.744ms 21.830ms 0.410ms 21.277ms 23.496ms 22.524ms diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py new file mode 100644 index 000000000..a79c339ab --- /dev/null +++ b/benchmarks/run_all.py @@ -0,0 +1,74 @@ +import subprocess +import sys +import time +from pathlib import Path + +BENCHMARKS = [ + "bench_string_ops.py", + "bench_trie.py", + "bench_find_ending_with_fix.py", + "bench_dropin_replacements.py", + "bench_graph_loader.py", + "bench_file_hashing.py", + "bench_embedding_cache.py", + "bench_json_serialization.py", + "bench_ast_cache.py", + "bench_pathlib_vs_string.py", +] + + +def main() -> None: + bench_dir = Path(__file__).parent + results_dir = bench_dir / "results" + results_dir.mkdir(exist_ok=True) + + timestamp = time.strftime("%Y%m%d_%H%M%S") + overall_start = time.perf_counter() + + print(f"Running {len(BENCHMARKS)} benchmark suites") + print(f"Results will be saved to: {results_dir}") + print(f"Timestamp: {timestamp}") + print("=" * 80) + + for bench_file in BENCHMARKS: + bench_path = bench_dir / bench_file + if not bench_path.exists(): + print(f"SKIP: {bench_file} (not found)") + continue + + result_file = results_dir / f"{bench_path.stem}_{timestamp}.txt" + print(f"\nRunning: {bench_file}") + + start = time.perf_counter() + result = subprocess.run( + [sys.executable, str(bench_path)], + capture_output=True, + text=True, + timeout=600, + ) + elapsed = time.perf_counter() - start + + output = result.stdout + if result.returncode != 0: + output += f"\nSTDERR:\n{result.stderr}" + print(f" FAILED (exit code {result.returncode}, {elapsed:.1f}s)") + else: + print(f" OK ({elapsed:.1f}s)") + + with result_file.open("w") as f: + f.write(f"Benchmark: {bench_file}\n") + f.write(f"Timestamp: {timestamp}\n") + f.write(f"Exit code: {result.returncode}\n") + f.write(f"Duration: {elapsed:.1f}s\n") + f.write(f"Python: {sys.version}\n") + f.write("=" * 80 + "\n") + f.write(output) + + total = time.perf_counter() - overall_start + print(f"\n{'='*80}") 
+ print(f"All benchmarks completed in {total:.1f}s") + print(f"Results saved in: {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/build_binary.py b/build_binary.py index b82c48c6e..fd1884a0c 100644 --- a/build_binary.py +++ b/build_binary.py @@ -70,6 +70,9 @@ def build_binary() -> bool: for pkg in cs.PYINSTALLER_PACKAGES: cmd.extend(_build_package_args(pkg)) + for mod in cs.PYINSTALLER_EXCLUDED_MODULES: + cmd.extend([cs.PYINSTALLER_ARG_EXCLUDE_MODULE, mod]) + cmd.append(cs.PYINSTALLER_ENTRY_POINT) logger.info(logs.BUILD_BINARY.format(name=binary_name)) diff --git a/cgr/__init__.py b/cgr/__init__.py new file mode 100644 index 000000000..3d76ac771 --- /dev/null +++ b/cgr/__init__.py @@ -0,0 +1,14 @@ +from codebase_rag.config import settings +from codebase_rag.embedder import embed_code +from codebase_rag.graph_loader import GraphLoader, load_graph +from codebase_rag.services.graph_service import MemgraphIngestor +from codebase_rag.services.llm import CypherGenerator + +__all__ = [ + "CypherGenerator", + "GraphLoader", + "MemgraphIngestor", + "embed_code", + "load_graph", + "settings", +] diff --git a/codebase_rag/cli.py b/codebase_rag/cli.py index 87f9a5379..8b04ccef0 100644 --- a/codebase_rag/cli.py +++ b/codebase_rag/cli.py @@ -1,4 +1,6 @@ import asyncio +from collections.abc import Callable +from importlib.metadata import version as get_version from pathlib import Path import typer @@ -17,6 +19,7 @@ export_graph_to_file, main_async, main_optimize_async, + main_single_query, prompt_for_unignored_directories, style, update_model_settings, @@ -25,15 +28,27 @@ from .services.protobuf_service import ProtobufFileIngestor from .tools.health_checker import HealthChecker from .tools.language import cli as language_cli +from .types_defs import ResultRow app = typer.Typer( - name="code-graph-rag", + name=cs.PACKAGE_NAME, help=ch.APP_DESCRIPTION, no_args_is_help=True, add_completion=False, ) +def _version_callback(value: bool) -> None: + if value: + app_context.console.print( + cs.CLI_MSG_VERSION.format( + package=cs.PACKAGE_NAME, version=get_version(cs.PACKAGE_NAME) + ), + highlight=False, + ) + raise typer.Exit() + + def validate_models_early() -> None: try: orchestrator_config = settings.active_orchestrator_config @@ -58,6 +73,14 @@ def _update_and_validate_models(orchestrator: str | None, cypher: str | None) -> @app.callback() def _global_options( + version: bool | None = typer.Option( + None, + "--version", + "-v", + help=ch.HELP_VERSION, + callback=_version_callback, + is_eager=True, + ), quiet: bool = typer.Option( False, "--quiet", @@ -77,6 +100,18 @@ def _info(msg: str) -> None: app_context.console.print(msg) +def _delete_hash_cache(repo_path: Path) -> None: + cache_path = repo_path / cs.HASH_CACHE_FILENAME + if cache_path.exists(): + _info( + style( + cs.CLI_MSG_CLEANING_HASH_CACHE.format(path=cache_path), + cs.Color.YELLOW, + ) + ) + cache_path.unlink(missing_ok=True) + + @app.command(help=ch.CMD_START) def start( repo_path: str | None = typer.Option( @@ -119,6 +154,11 @@ def start( min=1, help=ch.HELP_BATCH_SIZE, ), + project_name: str | None = typer.Option( + None, + "--project-name", + help=ch.HELP_PROJECT_NAME, + ), exclude: list[str] | None = typer.Option( None, "--exclude", @@ -129,6 +169,12 @@ def start( "--interactive-setup", help=ch.HELP_INTERACTIVE_SETUP, ), + ask_agent: str | None = typer.Option( + None, + "-a", + "--ask-agent", + help=ch.HELP_ASK_AGENT, + ), ) -> None: app_context.session.confirm_edits = not no_confirm @@ -140,10 +186,20 @@ def start( ) 
raise typer.Exit(1) - _update_and_validate_models(orchestrator, cypher) - effective_batch_size = settings.resolve_batch_size(batch_size) + if clean and not update_graph: + repo_to_clean = Path(target_repo_path) + with connect_memgraph(effective_batch_size) as ingestor: + _info(style(cs.CLI_MSG_CLEANING_DB, cs.Color.YELLOW)) + ingestor.clean_database() + + _delete_hash_cache(repo_to_clean) + _info(style(cs.CLI_MSG_CLEAN_DONE, cs.Color.GREEN)) + return + + _update_and_validate_models(orchestrator, cypher) + if update_graph: repo_to_update = Path(target_repo_path) _info( @@ -164,17 +220,20 @@ def start( if clean: _info(style(cs.CLI_MSG_CLEANING_DB, cs.Color.YELLOW)) ingestor.clean_database() + _delete_hash_cache(repo_to_update) + ingestor.ensure_constraints() parsers, queries = load_parsers() updater = GraphUpdater( - ingestor, - repo_to_update, - parsers, - queries, - unignore_paths, - exclude_paths, + ingestor=ingestor, + repo_path=repo_to_update, + parsers=parsers, + queries=queries, + unignore_paths=unignore_paths, + exclude_paths=exclude_paths, + project_name=project_name, ) updater.run() @@ -187,7 +246,10 @@ def start( return try: - asyncio.run(main_async(target_repo_path, effective_batch_size)) + if ask_agent: + main_single_query(target_repo_path, effective_batch_size, ask_agent) + else: + asyncio.run(main_async(target_repo_path, effective_batch_size)) except KeyboardInterrupt: app_context.console.print(style(cs.CLI_MSG_APP_TERMINATED, cs.Color.RED)) except ValueError as e: @@ -245,7 +307,12 @@ def index( ) parsers, queries = load_parsers() updater = GraphUpdater( - ingestor, repo_to_index, parsers, queries, unignore_paths, exclude_paths + ingestor=ingestor, + repo_path=repo_to_index, + parsers=parsers, + queries=queries, + unignore_paths=unignore_paths, + exclude_paths=exclude_paths, ) updater.run() @@ -357,11 +424,24 @@ def optimize( @app.command(name=ch.CLICommandName.MCP_SERVER, help=ch.CMD_MCP_SERVER) -def mcp_server() -> None: +def mcp_server( + transport: cs.MCPTransport = typer.Option( + cs.MCPTransport.STDIO, help=ch.HELP_MCP_TRANSPORT + ), + host: str = typer.Option(None, help=ch.HELP_MCP_HTTP_HOST), + port: int = typer.Option(None, help=ch.HELP_MCP_HTTP_PORT), +) -> None: try: - from codebase_rag.mcp import main as mcp_main + if transport == cs.MCPTransport.HTTP: + from codebase_rag.mcp import serve_http + + resolved_host = host or settings.MCP_HTTP_HOST + resolved_port = port or settings.MCP_HTTP_PORT + asyncio.run(serve_http(host=resolved_host, port=resolved_port)) + else: + from codebase_rag.mcp import serve_stdio - asyncio.run(mcp_main()) + asyncio.run(serve_stdio()) except KeyboardInterrupt: app_context.console.print(style(cs.CLI_MSG_APP_TERMINATED, cs.Color.RED)) except ValueError as e: @@ -369,7 +449,6 @@ def mcp_server() -> None: style(cs.CLI_ERR_CONFIG.format(error=e), cs.Color.RED) ) _info(style(cs.CLI_MSG_HINT_TARGET_REPO, cs.Color.YELLOW)) - except Exception as e: app_context.console.print( style(cs.CLI_ERR_MCP_SERVER.format(error=e), cs.Color.RED) @@ -465,5 +544,75 @@ def doctor() -> None: raise typer.Exit(1) +def _build_stats_table( + title: str, + col_label: str, + rows: list[ResultRow], + get_label: Callable[[ResultRow], str], + total_label: str, +) -> Table: + table = Table( + title=style(title, cs.Color.GREEN), + show_header=True, + header_style=f"{cs.StyleModifier.BOLD} {cs.Color.MAGENTA}", + ) + table.add_column(col_label, style=cs.Color.CYAN) + table.add_column(cs.CLI_STATS_COL_COUNT, style=cs.Color.YELLOW, justify="right") + total = 0 + for row in rows: 
+ raw_count = row.get("count", 0) + count = int(raw_count) if isinstance(raw_count, (int, float)) else 0 + total += count + table.add_row(get_label(row), f"{count:,}") + table.add_section() + table.add_row( + style(total_label, cs.Color.GREEN), + style(f"{total:,}", cs.Color.GREEN), + ) + return table + + +@app.command(name=ch.CLICommandName.STATS, help=ch.CMD_STATS) +def stats() -> None: + from .cypher_queries import ( + CYPHER_STATS_NODE_COUNTS, + CYPHER_STATS_RELATIONSHIP_COUNTS, + ) + + app_context.console.print(style(cs.CLI_MSG_CONNECTING_STATS, cs.Color.CYAN)) + + try: + with connect_memgraph(batch_size=1) as ingestor: + node_results = ingestor.fetch_all(CYPHER_STATS_NODE_COUNTS) + rel_results = ingestor.fetch_all(CYPHER_STATS_RELATIONSHIP_COUNTS) + + app_context.console.print( + _build_stats_table( + cs.CLI_STATS_NODE_TITLE, + cs.CLI_STATS_COL_NODE_TYPE, + node_results, + lambda r: ":".join(r.get("labels", [])) or cs.CLI_STATS_UNKNOWN, + cs.CLI_STATS_TOTAL_NODES, + ) + ) + app_context.console.print() + app_context.console.print( + _build_stats_table( + cs.CLI_STATS_REL_TITLE, + cs.CLI_STATS_COL_REL_TYPE, + rel_results, + lambda r: str(r.get("type", cs.CLI_STATS_UNKNOWN)), + cs.CLI_STATS_TOTAL_RELS, + ) + ) + + except Exception as e: + app_context.console.print( + style(cs.CLI_ERR_STATS_FAILED.format(error=e), cs.Color.RED) + ) + logger.exception(ls.STATS_ERROR.format(error=e)) + raise typer.Exit(1) from e + + if __name__ == "__main__": app() diff --git a/codebase_rag/cli_help.py b/codebase_rag/cli_help.py index 96e816d9a..1e3751524 100644 --- a/codebase_rag/cli_help.py +++ b/codebase_rag/cli_help.py @@ -10,6 +10,7 @@ class CLICommandName(StrEnum): GRAPH_LOADER = "graph-loader" LANGUAGE = "language" DOCTOR = "doctor" + STATS = "stats" APP_DESCRIPTION = ( @@ -26,6 +27,7 @@ class CLICommandName(StrEnum): CMD_GRAPH_LOADER = "Load and display summary of exported graph JSON" CMD_LANGUAGE = "Manage language grammars (add, remove, list)" CMD_DOCTOR = "Verify that all dependencies and configurations are properly set up" +CMD_STATS = "Display node and relationship statistics for the indexed graph" CMD_LANGUAGE_GROUP = "CLI for managing language grammars" CMD_LANGUAGE_ADD = "Add a new language grammar to the project." @@ -38,11 +40,11 @@ class CLICommandName(StrEnum): HELP_MEMGRAPH_PORT = "Memgraph port" HELP_ORCHESTRATOR = ( "Specify orchestrator as provider:model " - "(e.g., ollama:llama3.2, openai:gpt-4, google:gemini-2.5-pro)" + "(e.g., ollama:llama3.2, openai:gpt-4, google:gemini-3.1-pro-preview)" ) HELP_CYPHER_MODEL = ( "Specify cypher model as provider:model " - "(e.g., ollama:codellama, google:gemini-2.5-flash)" + "(e.g., ollama:codellama, google:gemini-3-flash-preview)" ) HELP_NO_CONFIRM = "Disable confirmation prompts for edit operations (YOLO mode)" @@ -50,6 +52,12 @@ class CLICommandName(StrEnum): HELP_REPO_PATH_INDEX = "Path to the target repository to index." HELP_REPO_PATH_OPTIMIZE = "Path to the repository to optimize" HELP_REPO_PATH_WATCH = "Path to the repository to watch." +HELP_VERSION = "Show the version and exit." + +HELP_DEBOUNCE = "Debounce delay in seconds. Set to 0 to disable debouncing." +HELP_MAX_WAIT = ( + "Maximum wait time in seconds before forcing an update during continuous edits." 
+) HELP_UPDATE_GRAPH = "Update the knowledge graph by parsing the repository" HELP_CLEAN_DB = "Clean the database before updating (use when adding first repo)" @@ -73,6 +81,10 @@ class CLICommandName(StrEnum): ) HELP_KEEP_SUBMODULE = "Keep the git submodule (default: remove it)" +HELP_PROJECT_NAME = ( + "Override the project name used as qualified-name prefix for all nodes. " + "Defaults to the repo directory name." +) HELP_EXCLUDE_PATTERNS = ( "Additional directories to exclude from indexing. Can be specified multiple times." ) @@ -81,6 +93,19 @@ class CLICommandName(StrEnum): "Without this flag, all directories matching ignore patterns are automatically excluded." ) +HELP_ASK_AGENT = ( + "Run a single query in non-interactive mode and exit. " + "Output is sent to stdout, useful for scripting." +) + +HELP_MCP_TRANSPORT = "Transport mode: 'stdio' (default) or 'http'" +HELP_MCP_HTTP_HOST = ( + "Host to bind the HTTP server — only used when --transport http (default: 0.0.0.0)" +) +HELP_MCP_HTTP_PORT = ( + "Port to bind the HTTP server — only used when --transport http (default: 8080)" +) + CLI_COMMANDS: dict[CLICommandName, str] = { CLICommandName.START: CMD_START, CLICommandName.INDEX: CMD_INDEX, @@ -90,4 +115,5 @@ class CLICommandName(StrEnum): CLICommandName.GRAPH_LOADER: CMD_GRAPH_LOADER, CLICommandName.LANGUAGE: CMD_LANGUAGE, CLICommandName.DOCTOR: CMD_DOCTOR, + CLICommandName.STATS: CMD_STATS, } diff --git a/codebase_rag/config.py b/codebase_rag/config.py index 31848e4d1..d49f3948c 100644 --- a/codebase_rag/config.py +++ b/codebase_rag/config.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os from dataclasses import asdict, dataclass from pathlib import Path from typing import TypedDict, Unpack @@ -44,11 +45,6 @@ class ApiKeyInfoEntry(TypedDict): "url": "https://portal.azure.com/", "name": "Azure OpenAI", }, - cs.Provider.COHERE: { - "env_var": "COHERE_API_KEY", - "url": "https://dashboard.cohere.com/api-keys", - "name": "Cohere", - }, } @@ -94,6 +90,9 @@ def format_missing_api_key_errors( return error_msg +LOCAL_PROVIDERS = frozenset({cs.Provider.OLLAMA}) + + @dataclass class ModelConfig: provider: str @@ -113,8 +112,20 @@ def to_update_kwargs(self) -> ModelConfigKwargs: return ModelConfigKwargs(**result) def validate_api_key(self, role: str = cs.DEFAULT_MODEL_ROLE) -> None: - local_providers = {cs.Provider.OLLAMA, cs.Provider.LOCAL, cs.Provider.VLLM} - if self.provider.lower() in local_providers: + provider_lower = self.provider.lower() + provider_env_keys = { + cs.Provider.ANTHROPIC: cs.ENV_ANTHROPIC_API_KEY, + cs.Provider.AZURE: cs.ENV_AZURE_API_KEY, + } + env_key = provider_env_keys.get(provider_lower) + if ( + provider_lower in LOCAL_PROVIDERS + or ( + provider_lower == cs.Provider.GOOGLE + and self.provider_type == cs.GoogleProviderType.VERTEX + ) + or (env_key and os.environ.get(env_key)) + ): return if ( not self.api_key @@ -139,6 +150,8 @@ class AppConfig(BaseSettings): MEMGRAPH_HOST: str = "localhost" MEMGRAPH_PORT: int = 7687 MEMGRAPH_HTTP_PORT: int = 7444 + MEMGRAPH_USERNAME: str | None = None + MEMGRAPH_PASSWORD: str | None = None LAB_PORT: int = 3000 MEMGRAPH_BATCH_SIZE: int = 1000 AGENT_RETRIES: int = 3 @@ -150,7 +163,7 @@ class AppConfig(BaseSettings): ORCHESTRATOR_ENDPOINT: str | None = None ORCHESTRATOR_PROJECT_ID: str | None = None ORCHESTRATOR_REGION: str = cs.DEFAULT_REGION - ORCHESTRATOR_PROVIDER_TYPE: str | None = None + ORCHESTRATOR_PROVIDER_TYPE: cs.GoogleProviderType | None = None ORCHESTRATOR_THINKING_BUDGET: int | None = None 
ORCHESTRATOR_SERVICE_ACCOUNT_FILE: str | None = None @@ -160,7 +173,7 @@ class AppConfig(BaseSettings): CYPHER_ENDPOINT: str | None = None CYPHER_PROJECT_ID: str | None = None CYPHER_REGION: str = cs.DEFAULT_REGION - CYPHER_PROVIDER_TYPE: str | None = None + CYPHER_PROVIDER_TYPE: cs.GoogleProviderType | None = None CYPHER_THINKING_BUDGET: int | None = None CYPHER_SERVICE_ACCOUNT_FILE: str | None = None @@ -238,21 +251,35 @@ def ollama_endpoint(self) -> str: QDRANT_COLLECTION_NAME: str = "code_embeddings" QDRANT_VECTOR_DIM: int = 768 QDRANT_TOP_K: int = 5 + QDRANT_UPSERT_RETRIES: int = Field(default=3, gt=0) + QDRANT_RETRY_BASE_DELAY: float = Field(default=0.5, gt=0) + QDRANT_BATCH_SIZE: int = Field(default=50, gt=0) EMBEDDING_MAX_LENGTH: int = 512 EMBEDDING_PROGRESS_INTERVAL: int = 10 + FLUSH_THREAD_POOL_SIZE: int = Field(default=4, gt=0) + FILE_FLUSH_INTERVAL: int = Field(default=500, gt=0) + CACHE_MAX_ENTRIES: int = 1000 CACHE_MAX_MEMORY_MB: int = 500 CACHE_EVICTION_DIVISOR: int = 10 CACHE_MEMORY_THRESHOLD_RATIO: float = 0.8 + QUERY_RESULT_MAX_TOKENS: int = Field(default=16000, gt=0) + QUERY_RESULT_ROW_CAP: int = Field(default=500, gt=0) + OLLAMA_HEALTH_TIMEOUT: float = 5.0 + LITELLM_HEALTH_TIMEOUT: float = 5.0 _active_orchestrator: ModelConfig | None = None _active_cypher: ModelConfig | None = None QUIET: bool = Field(False, validation_alias="CGR_QUIET") + MCP_HTTP_HOST: str = "0.0.0.0" + MCP_HTTP_PORT: int = 8080 + MCP_HTTP_ENDPOINT_PATH: str = "/mcp" + def _get_default_config(self, role: str) -> ModelConfig: role_upper = role.upper() diff --git a/codebase_rag/constants.py b/codebase_rag/constants.py index 4ef971d8a..f1ad47324 100644 --- a/codebase_rag/constants.py +++ b/codebase_rag/constants.py @@ -20,9 +20,7 @@ class Provider(StrEnum): OPENAI = "openai" GOOGLE = "google" AZURE = "azure" - COHERE = "cohere" - LOCAL = "local" - VLLM = "vllm" + LITELLM_PROXY = "litellm_proxy" class Color(StrEnum): @@ -89,6 +87,7 @@ class FileAction(StrEnum): EXT_IXX = ".ixx" EXT_CPPM = ".cppm" EXT_CCM = ".ccm" +EXT_C = ".c" EXT_CS = ".cs" EXT_PHP = ".php" EXT_LUA = ".lua" @@ -101,6 +100,7 @@ class FileAction(StrEnum): GO_EXTENSIONS = (EXT_GO,) SCALA_EXTENSIONS = (EXT_SCALA, EXT_SC) JAVA_EXTENSIONS = (EXT_JAVA,) +C_EXTENSIONS = (EXT_C,) CPP_EXTENSIONS = ( EXT_CPP, EXT_H, @@ -131,6 +131,10 @@ class FileAction(StrEnum): ENV_OPENAI_API_KEY = "OPENAI_API_KEY" ENV_GOOGLE_API_KEY = "GOOGLE_API_KEY" +ENV_ANTHROPIC_API_KEY = "ANTHROPIC_API_KEY" +ENV_AZURE_API_KEY = "AZURE_API_KEY" +ENV_AZURE_ENDPOINT = "AZURE_OPENAI_ENDPOINT" +ENV_AZURE_API_VERSION = "AZURE_API_VERSION" HELP_ARG = "help" @@ -150,6 +154,8 @@ class GoogleProviderType(StrEnum): HTTP_OK = 200 UNIXCODER_MODEL = "microsoft/unixcoder-base" +EMBEDDING_DEFAULT_BATCH_SIZE = 32 +EMBEDDING_CACHE_FILENAME = ".embedding_cache.json" KEY_NODES = "nodes" KEY_RELATIONSHIPS = "relationships" @@ -171,6 +177,7 @@ class GoogleProviderType(StrEnum): KEY_START_LINE = "start_line" KEY_END_LINE = "end_line" KEY_PATH = "path" +KEY_ABSOLUTE_PATH = "absolute_path" KEY_EXTENSION = "extension" KEY_MODULE_TYPE = "module_type" KEY_IMPLEMENTS_MODULE = "implements_module" @@ -209,6 +216,10 @@ class GoogleProviderType(StrEnum): ONEOF_EXTERNAL_PACKAGE = "external_package" ONEOF_MODULE_IMPLEMENTATION = "module_implementation" ONEOF_MODULE_INTERFACE = "module_interface" +ONEOF_INTERFACE = "interface_node" +ONEOF_ENUM = "enum_node" +ONEOF_TYPE = "type_node" +ONEOF_UNION = "union_node" # (H) CLI error and info messages CLI_ERR_OUTPUT_REQUIRES_UPDATE = ( @@ -224,6 +235,8 @@ class 
GoogleProviderType(StrEnum):
 CLI_MSG_UPDATING_GRAPH = "Updating knowledge graph for: {path}"
 CLI_MSG_CLEANING_DB = "Cleaning database..."
+CLI_MSG_CLEANING_HASH_CACHE = "Removing hash cache: {path}"
+CLI_MSG_CLEAN_DONE = "Clean completed successfully!"
 CLI_MSG_EXPORTING_TO = "Exporting graph to: {path}"
 CLI_MSG_GRAPH_UPDATED = "Graph update completed!"
 CLI_MSG_APP_TERMINATED = "\nApplication terminated by user."
@@ -234,10 +247,22 @@ class GoogleProviderType(StrEnum):
 CLI_MSG_EXPORTING_DATA = "Exporting graph data..."
 CLI_MSG_OPTIMIZATION_TERMINATED = "\nOptimization session terminated by user."
 CLI_MSG_MCP_TERMINATED = "\nMCP server terminated by user."
+PACKAGE_NAME = "code-graph-rag"
+CLI_MSG_VERSION = "{package} version {version}"
 CLI_MSG_HINT_TARGET_REPO = (
     "\nHint: Make sure TARGET_REPO_PATH environment variable is set."
 )
 CLI_MSG_GRAPH_SUMMARY = "Graph Summary:"
+CLI_MSG_CONNECTING_STATS = "Fetching graph statistics..."
+CLI_STATS_NODE_TITLE = "Node Statistics"
+CLI_STATS_REL_TITLE = "Relationship Statistics"
+CLI_STATS_COL_NODE_TYPE = "Node Type"
+CLI_STATS_COL_REL_TYPE = "Relationship Type"
+CLI_STATS_COL_COUNT = "Count"
+CLI_STATS_TOTAL_NODES = "Total Nodes"
+CLI_STATS_TOTAL_RELS = "Total Relationships"
+CLI_STATS_UNKNOWN = "Unknown"
+CLI_ERR_STATS_FAILED = "Failed to get graph statistics: {error}"
 CLI_MSG_AUTO_EXCLUDE = (
     "Auto-excluding common directories (venv, node_modules, .git, etc.). "
     "Use --interactive-setup to customize."
@@ -268,7 +293,7 @@ class GoogleProviderType(StrEnum):
 UI_MODEL_SWITCHED = "[bold green]Model switched to: {model}[/bold green]"
 UI_MODEL_CURRENT = "[bold cyan]Current model: {model}[/bold cyan]"
 UI_MODEL_SWITCH_ERROR = "[bold red]Failed to switch model: {error}[/bold red]"
-UI_MODEL_USAGE = "[bold yellow]Usage: /model <provider:model> (e.g., /model google:gemini-2.0-flash)[/bold yellow]"
+UI_MODEL_USAGE = "[bold yellow]Usage: /model <provider:model> (e.g., /model google:gemini-3.1-pro-preview)[/bold yellow]"
 UI_HELP_COMMANDS = """[bold cyan]Available commands:[/bold cyan]
 /model <provider:model> - Switch to a different model
 /model - Show current model
@@ -417,14 +442,21 @@ class RelationshipType(StrEnum):
 # (H) Cypher queries
 CYPHER_DEFAULT_LIMIT = 50
-CYPHER_QUERY_EMBEDDINGS = """
+_CYPHER_EMBEDDING_BASE = """
 MATCH (m:Module)-[:DEFINES]->(n)
 WHERE (n:Function OR n:Method)
-  AND m.qualified_name STARTS WITH $project_name + '.'
-RETURN id(n) AS node_id, n.qualified_name AS qualified_name, + AND m.qualified_name STARTS WITH ($project_name + '.') +""" + +CYPHER_QUERY_EMBEDDINGS = ( + _CYPHER_EMBEDDING_BASE + + """RETURN id(n) AS node_id, n.qualified_name AS qualified_name, n.start_line AS start_line, n.end_line AS end_line, m.path AS path """ +) + +CYPHER_QUERY_PROJECT_NODE_IDS = _CYPHER_EMBEDDING_BASE + "RETURN id(n) AS node_id\n" class SupportedLanguage(StrEnum): @@ -435,6 +467,7 @@ class SupportedLanguage(StrEnum): GO = "go" SCALA = "scala" JAVA = "java" + C = "c" CPP = "cpp" CSHARP = "c-sharp" PHP = "php" @@ -468,6 +501,11 @@ class LanguageMetadata(NamedTuple): "Interfaces, type aliases, enums, namespaces, ES6/CommonJS modules", "TypeScript", ), + SupportedLanguage.C: LanguageMetadata( + LanguageStatus.FULL, + "Functions, structs, unions, enums, preprocessor includes", + "C", + ), SupportedLanguage.CPP: LanguageMetadata( LanguageStatus.FULL, "Constructors, destructors, operator overloading, templates, lambdas, C++20 modules, namespaces", @@ -504,8 +542,8 @@ class LanguageMetadata(NamedTuple): "C#", ), SupportedLanguage.PHP: LanguageMetadata( - LanguageStatus.DEV, - "Classes, functions, namespaces", + LanguageStatus.FULL, + "Classes, interfaces, traits, enums, namespaces, PHP 8 attributes", "PHP", ), } @@ -723,6 +761,7 @@ class DiffMarker: INPLACE_FLAG = "--inplace" LANG_ATTR_PREFIX = "language_" LANG_ATTR_TYPESCRIPT = "language_typescript" +LANG_ATTR_PHP = "language_php" class TreeSitterModule(StrEnum): @@ -733,8 +772,10 @@ class TreeSitterModule(StrEnum): GO = "tree_sitter_go" SCALA = "tree_sitter_scala" JAVA = "tree_sitter_java" + C = "tree_sitter_c" CPP = "tree_sitter_cpp" LUA = "tree_sitter_lua" + PHP = "tree_sitter_php" # (H) Query dict keys @@ -839,11 +880,26 @@ class TreeSitterModule(StrEnum): class EventType(StrEnum): MODIFIED = "modified" CREATED = "created" + DELETED = "deleted" CYPHER_DELETE_MODULE = "MATCH (m:Module {path: $path})-[*0..]->(c) DETACH DELETE m, c" +CYPHER_DELETE_FILE = "MATCH (f:File {path: $path}) DETACH DELETE f" +CYPHER_DELETE_FOLDER = "MATCH (f:Folder {path: $path}) DETACH DELETE f" CYPHER_DELETE_CALLS = "MATCH ()-[r:CALLS]->() DELETE r" +# (H) Queries for orphan pruning — returns all paths stored in the graph +CYPHER_ALL_FILE_PATHS = ( + "MATCH (f:File) RETURN f.path AS path, f.absolute_path AS absolute_path" +) +CYPHER_ALL_MODULE_PATHS_INTERNAL = ( + "MATCH (m:Module) WHERE m.is_external IS NULL OR m.is_external = false " + "RETURN m.path AS path, m.qualified_name AS qualified_name" +) +CYPHER_ALL_FOLDER_PATHS = ( + "MATCH (f:Folder) RETURN f.path AS path, f.absolute_path AS absolute_path" +) + REALTIME_LOGGER_FORMAT = ( "{time:YYYY-MM-DD HH:mm:ss.SSS} | " "{level: <8} | " @@ -853,6 +909,11 @@ class EventType(StrEnum): WATCHER_SLEEP_INTERVAL = 1 LOG_LEVEL_INFO = "INFO" +LOG_LEVEL_ERROR = "ERROR" + +# (H) Debounce settings for realtime watcher +DEFAULT_DEBOUNCE_SECONDS = 5 +DEFAULT_MAX_WAIT_SECONDS = 30 class Architecture(StrEnum): @@ -880,8 +941,11 @@ class Architecture(StrEnum): PYINSTALLER_ARG_COLLECT_ALL = "--collect-all" PYINSTALLER_ARG_COLLECT_DATA = "--collect-data" PYINSTALLER_ARG_HIDDEN_IMPORT = "--hidden-import" +PYINSTALLER_ARG_EXCLUDE_MODULE = "--exclude-module" PYINSTALLER_ENTRY_POINT = "main.py" +PYINSTALLER_EXCLUDED_MODULES = ["logfire"] + # (H) TOML parsing constants TOML_KEY_PROJECT = "project" TOML_KEY_OPTIONAL_DEPS = "optional-dependencies" @@ -905,6 +969,7 @@ class Architecture(StrEnum): PyInstallerPackage(name="loguru", collect_all=True), 
PyInstallerPackage(name="toml", collect_all=True), PyInstallerPackage(name="protobuf", collect_all=True), + PyInstallerPackage(name="genai_prices", collect_all=True), ] ALLOWED_COMMENT_MARKERS = frozenset( @@ -961,6 +1026,22 @@ class UniXcoderMode(StrEnum): CYPHER_SEMICOLON = ";" CYPHER_BACKTICK = "`" CYPHER_MATCH_KEYWORD = "MATCH" +CYPHER_DANGEROUS_KEYWORDS: frozenset[str] = frozenset( + { + "DELETE", + "DETACH", + "DROP", + "CREATE INDEX", + "CREATE CONSTRAINT", + "REMOVE", + "SET", + "MERGE", + "CREATE", + "CALL", + "LOAD CSV", + "FOREACH", + } +) # (H) Tool success messages MSG_SURGICAL_SUCCESS = "Successfully applied surgical code replacement in: {path}" @@ -1105,7 +1186,12 @@ class UniXcoderMode(StrEnum): # (H) Query tool messages QUERY_NOT_AVAILABLE = "N/A" DICT_KEY_RESULTS = "results" +TIKTOKEN_ENCODING = "cl100k_base" QUERY_SUMMARY_SUCCESS = "Successfully retrieved {count} item(s) from the graph." +QUERY_SUMMARY_TRUNCATED = ( + "Results truncated: showing {kept} of {total} items (~{tokens} tokens, limit {max_tokens}). " + "Refine your query for more specific results." +) QUERY_SUMMARY_TRANSLATION_FAILED = ( "I couldn't translate your request into a database query. Error: {error}" ) @@ -1569,6 +1655,9 @@ class CppNodeType(StrEnum): # (H) Gemfile parsing patterns GEMFILE_GEM_PREFIX = "gem " +# (H) Incremental update hash cache +HASH_CACHE_FILENAME = ".cgr-hash-cache.json" + # (H) Import processor cache config IMPORT_CACHE_TTL = 3600 IMPORT_CACHE_DIR = ".cache/codebase_rag" @@ -1696,6 +1785,8 @@ class CppNodeType(StrEnum): TS_CS_INVOCATION_EXPRESSION = "invocation_expression" # (H) Tree-sitter PHP node types +TS_PHP_FUNCTION_DEFINITION = "function_definition" +TS_PHP_METHOD_DECLARATION = "method_declaration" TS_PHP_TRAIT_DECLARATION = "trait_declaration" TS_PHP_FUNCTION_STATIC_DECLARATION = "function_static_declaration" TS_PHP_ANONYMOUS_FUNCTION = "anonymous_function" @@ -1704,6 +1795,20 @@ class CppNodeType(StrEnum): TS_PHP_SCOPED_CALL_EXPRESSION = "scoped_call_expression" TS_PHP_FUNCTION_CALL_EXPRESSION = "function_call_expression" TS_PHP_NULLSAFE_MEMBER_CALL_EXPRESSION = "nullsafe_member_call_expression" +TS_PHP_OBJECT_CREATION_EXPRESSION = "object_creation_expression" +TS_PHP_NAMESPACE_DEFINITION = "namespace_definition" +TS_PHP_NAMESPACE_USE_DECLARATION = "namespace_use_declaration" +TS_PHP_NAMESPACE_USE_CLAUSE = "namespace_use_clause" +TS_PHP_INCLUDE_EXPRESSION = "include_expression" +TS_PHP_INCLUDE_ONCE_EXPRESSION = "include_once_expression" +TS_PHP_REQUIRE_EXPRESSION = "require_expression" +TS_PHP_REQUIRE_ONCE_EXPRESSION = "require_once_expression" +TS_PHP_ATTRIBUTE_LIST = "attribute_list" +TS_PHP_ATTRIBUTE = "attribute" +TS_PHP_ATTRIBUTE_GROUP = "attribute_group" +TS_PHP_VISIBILITY_MODIFIER = "visibility_modifier" +TS_PHP_USE_DECLARATION = "use_declaration" +TS_PHP_QUALIFIED_NAME = "qualified_name" # (H) Tree-sitter Lua node types for language_spec TS_LUA_CHUNK = "chunk" @@ -1827,6 +1932,20 @@ class CppNodeType(StrEnum): } ) +# (H) Java stdlib package prefixes for static stdlib detection +JAVA_STDLIB_PREFIXES = ( + "java.", + "javax.", + "jdk.", + "com.sun.", + "sun.", + "org.w3c.", + "org.xml.", + "org.ietf.", + "org.omg.", + "netscape.", +) + # (H) Java common class names for heuristic detection JAVA_STDLIB_CLASSES = frozenset( { @@ -2134,8 +2253,9 @@ class CppNodeType(StrEnum): TYPE_INFERENCE_LIST = "list" TYPE_INFERENCE_BASE_MODEL = "BaseModel" -# (H) Type inference guard attribute +# (H) Recursion guard attributes ATTR_TYPE_INFERENCE_IN_PROGRESS = 
"_type_inference_in_progress" +GUARD_INHERITED_METHOD = "_inherited_method_guard" # (H) JS/TS ingest node types TS_PAIR = "pair" @@ -2355,12 +2475,21 @@ class MCPToolName(StrEnum): DELETE_PROJECT = "delete_project" WIPE_DATABASE = "wipe_database" INDEX_REPOSITORY = "index_repository" + UPDATE_REPOSITORY = "update_repository" QUERY_CODE_GRAPH = "query_code_graph" GET_CODE_SNIPPET = "get_code_snippet" SURGICAL_REPLACE_CODE = "surgical_replace_code" READ_FILE = "read_file" WRITE_FILE = "write_file" LIST_DIRECTORY = "list_directory" + SEMANTIC_SEARCH = "semantic_search" + ASK_AGENT = "ask_agent" + + +# (H) MCP transport selection +class MCPTransport(StrEnum): + STDIO = "stdio" + HTTP = "http" # (H) MCP environment variables @@ -2400,6 +2529,8 @@ class MCPParamName(StrEnum): LIMIT = "limit" CONTENT = "content" DIRECTORY_PATH = "directory_path" + TOP_K = "top_k" + QUESTION = "question" # (H) MCP server constants @@ -2418,6 +2549,12 @@ class MCPParamName(StrEnum): MCP_WRITE_SUCCESS = "Successfully wrote file: {path}" MCP_UNKNOWN_TOOL_ERROR = "Unknown tool: {name}" MCP_TOOL_EXEC_ERROR = "Error executing tool '{name}': {error}" +MCP_UPDATE_SUCCESS = "Successfully updated repository at {path} (no database wipe)." +MCP_UPDATE_ERROR = "Error updating repository: {error}" +MCP_SEMANTIC_NOT_AVAILABLE_RESPONSE = ( + "Semantic search is not available. Install with: uv sync --extra semantic" +) +MCP_ASK_AGENT_ERROR = "Error running ask_agent: {error}" MCP_PROJECT_DELETED = "Successfully deleted project '{project_name}'." MCP_WIPE_CANCELLED = "Database wipe cancelled. Set confirm=true to proceed." MCP_WIPE_SUCCESS = "Database completely wiped. All projects have been removed." @@ -2572,13 +2709,14 @@ class MCPParamName(StrEnum): TS_CLASS_DECLARATION, TS_INTERFACE_DECLARATION, TS_PHP_TRAIT_DECLARATION, + TS_PHP_NAMESPACE_DEFINITION, TS_PROGRAM, ) FQN_PHP_FUNCTION_TYPES = ( - TS_PY_FUNCTION_DEFINITION, + TS_PHP_FUNCTION_DEFINITION, + TS_PHP_METHOD_DECLARATION, TS_PHP_ANONYMOUS_FUNCTION, TS_PHP_ARROW_FUNCTION, - TS_PHP_FUNCTION_STATIC_DECLARATION, ) # (H) LANGUAGE_SPECS node type tuples for Python @@ -2617,6 +2755,13 @@ class MCPParamName(StrEnum): TS_ENUM_SPECIFIER, ) +# (H) Derived node types for _c_get_name +C_NAME_NODE_TYPES = ( + TS_STRUCT_SPECIFIER, + TS_UNION_SPECIFIER, + TS_ENUM_SPECIFIER, +) + # (H) LANGUAGE_SPECS node type tuples for Rust SPEC_RS_FUNCTION_TYPES = ( TS_RS_FUNCTION_ITEM, @@ -2713,6 +2858,26 @@ class MCPParamName(StrEnum): PKG_CONANFILE, ) +# (H) FQN node type tuples for C +FQN_C_SCOPE_TYPES = ( + TS_CPP_TRANSLATION_UNIT, + TS_STRUCT_SPECIFIER, + TS_UNION_SPECIFIER, + TS_ENUM_SPECIFIER, +) +FQN_C_FUNCTION_TYPES = (TS_CPP_FUNCTION_DEFINITION,) + +# (H) LANGUAGE_SPECS node type tuples for C +SPEC_C_FUNCTION_TYPES = (TS_CPP_FUNCTION_DEFINITION,) +SPEC_C_CLASS_TYPES = ( + TS_STRUCT_SPECIFIER, + TS_UNION_SPECIFIER, + TS_ENUM_SPECIFIER, +) +SPEC_C_MODULE_TYPES = (TS_CPP_TRANSLATION_UNIT,) +SPEC_C_CALL_TYPES = (TS_CPP_CALL_EXPRESSION,) +SPEC_C_PACKAGE_INDICATORS = (PKG_CMAKE_LISTS, PKG_MAKEFILE) + # (H) LANGUAGE_SPECS node type tuples for C# SPEC_CS_FUNCTION_TYPES = ( TS_CS_DESTRUCTOR_DECLARATION, @@ -2734,23 +2899,31 @@ class MCPParamName(StrEnum): # (H) LANGUAGE_SPECS node type tuples for PHP SPEC_PHP_FUNCTION_TYPES = ( - TS_PHP_FUNCTION_STATIC_DECLARATION, + TS_PHP_FUNCTION_DEFINITION, + TS_PHP_METHOD_DECLARATION, TS_PHP_ANONYMOUS_FUNCTION, - TS_PY_FUNCTION_DEFINITION, TS_PHP_ARROW_FUNCTION, ) SPEC_PHP_CLASS_TYPES = ( + TS_CLASS_DECLARATION, + TS_INTERFACE_DECLARATION, 
TS_PHP_TRAIT_DECLARATION, TS_ENUM_DECLARATION, - TS_INTERFACE_DECLARATION, - TS_CLASS_DECLARATION, ) SPEC_PHP_MODULE_TYPES = (TS_PROGRAM,) SPEC_PHP_CALL_TYPES = ( + TS_PHP_FUNCTION_CALL_EXPRESSION, TS_PHP_MEMBER_CALL_EXPRESSION, TS_PHP_SCOPED_CALL_EXPRESSION, - TS_PHP_FUNCTION_CALL_EXPRESSION, TS_PHP_NULLSAFE_MEMBER_CALL_EXPRESSION, + TS_PHP_OBJECT_CREATION_EXPRESSION, +) +SPEC_PHP_IMPORT_TYPES = (TS_PHP_NAMESPACE_USE_DECLARATION,) +SPEC_PHP_IMPORT_FROM_TYPES = ( + TS_PHP_INCLUDE_EXPRESSION, + TS_PHP_INCLUDE_ONCE_EXPRESSION, + TS_PHP_REQUIRE_EXPRESSION, + TS_PHP_REQUIRE_ONCE_EXPRESSION, ) # (H) LANGUAGE_SPECS node type tuples for Lua diff --git a/codebase_rag/cypher_queries.py b/codebase_rag/cypher_queries.py index 8d70bae4e..d441b9c47 100644 --- a/codebase_rag/cypher_queries.py +++ b/codebase_rag/cypher_queries.py @@ -52,8 +52,8 @@ CYPHER_EXAMPLE_LIMIT_ONE = """MATCH (f:File) RETURN f.path as path, f.name as name, labels(f) as type LIMIT 1""" CYPHER_EXAMPLE_CLASS_METHODS = f"""MATCH (c:Class)-[:DEFINES_METHOD]->(m:Method) -WHERE c.qualified_name ENDS WITH '.UserService' -RETURN m.name AS name, m.qualified_name AS qualified_name, labels(m) AS type +WHERE c.name = 'UserService' +RETURN c.name AS className, m.name AS methodName, m.qualified_name AS qualified_name, labels(m) AS type LIMIT {CYPHER_DEFAULT_LIMIT}""" CYPHER_EXPORT_NODES = """ @@ -84,6 +84,19 @@ """ +CYPHER_STATS_NODE_COUNTS = """ +MATCH (n) +RETURN labels(n) AS labels, count(*) AS count +ORDER BY count DESC +""" + +CYPHER_STATS_RELATIONSHIP_COUNTS = """ +MATCH ()-[r]->() +RETURN type(r) AS type, count(*) AS count +ORDER BY count DESC +""" + + def wrap_with_unwind(query: str) -> str: return f"UNWIND $batch AS row\n{query}" @@ -126,3 +139,24 @@ def build_merge_relationship_query( ) query += CYPHER_SET_PROPS_RETURN_COUNT if has_props else CYPHER_RETURN_COUNT return query + + +def build_create_node_query(label: str, id_key: str) -> str: + return f"CREATE (n:{label} {{{id_key}: row.id}})\nSET n += row.props" + + +def build_create_relationship_query( + from_label: str, + from_key: str, + rel_type: str, + to_label: str, + to_key: str, + has_props: bool = False, +) -> str: + query = ( + f"MATCH (a:{from_label} {{{from_key}: row.from_val}}), " + f"(b:{to_label} {{{to_key}: row.to_val}})\n" + f"CREATE (a)-[r:{rel_type}]->(b)\n" + ) + query += CYPHER_SET_PROPS_RETURN_COUNT if has_props else CYPHER_RETURN_COUNT + return query diff --git a/codebase_rag/embedder.py b/codebase_rag/embedder.py index 0928cae97..d9d126645 100644 --- a/codebase_rag/embedder.py +++ b/codebase_rag/embedder.py @@ -1,19 +1,96 @@ -# ┌────────────────────────────────────────────────────────────────────────┐ -# │ UniXcoder Model Singleton via LRU Cache │ -# ├────────────────────────────────────────────────────────────────────────┤ -# │ get_model() provides: │ -# │ - Singleton behavior without global variables │ -# │ - Thread-safe lazy initialization │ -# │ - Easy testability with cache_clear() method │ -# │ - Memory efficient with maxsize=1 │ -# └────────────────────────────────────────────────────────────────────────┘ +from __future__ import annotations + +import hashlib +import json from functools import lru_cache +from pathlib import Path + +from loguru import logger +from . import constants as cs from . import exceptions as ex +from . 
import logs as ls from .config import settings -from .constants import UNIXCODER_MODEL from .utils.dependencies import has_torch, has_transformers + +class EmbeddingCache: + __slots__ = ("_cache", "_path") + + def __init__(self, path: Path | None = None) -> None: + self._cache: dict[str, list[float]] = {} + self._path = path + + @staticmethod + def _content_hash(content: str) -> str: + return hashlib.sha256(content.encode()).hexdigest() + + def get(self, content: str) -> list[float] | None: + return self._cache.get(self._content_hash(content)) + + def put(self, content: str, embedding: list[float]) -> None: + self._cache[self._content_hash(content)] = embedding + + def get_many(self, snippets: list[str]) -> dict[int, list[float]]: + results: dict[int, list[float]] = {} + for i, snippet in enumerate(snippets): + if (cached := self.get(snippet)) is not None: + results[i] = cached + return results + + def put_many(self, snippets: list[str], embeddings: list[list[float]]) -> None: + for snippet, embedding in zip(snippets, embeddings): + self.put(snippet, embedding) + + def save(self) -> None: + if self._path is None: + return + try: + self._path.parent.mkdir(parents=True, exist_ok=True) + with self._path.open("w", encoding="utf-8") as f: + json.dump(self._cache, f) + except Exception as e: + logger.warning(ls.EMBEDDING_CACHE_SAVE_FAILED, path=self._path, error=e) + + def load(self) -> None: + if self._path is None or not self._path.exists(): + return + try: + with self._path.open("r", encoding="utf-8") as f: + self._cache = json.load(f) + logger.debug( + ls.EMBEDDING_CACHE_LOADED, count=len(self._cache), path=self._path + ) + except Exception as e: + logger.warning(ls.EMBEDDING_CACHE_LOAD_FAILED, path=self._path, error=e) + self._cache = {} + + def clear(self) -> None: + self._cache.clear() + + def __len__(self) -> int: + return len(self._cache) + + +_embedding_cache: EmbeddingCache | None = None + + +def get_embedding_cache() -> EmbeddingCache: + global _embedding_cache + if _embedding_cache is None: + cache_path = Path(settings.QDRANT_DB_PATH) / cs.EMBEDDING_CACHE_FILENAME + _embedding_cache = EmbeddingCache(path=cache_path) + _embedding_cache.load() + return _embedding_cache + + +def clear_embedding_cache() -> None: + global _embedding_cache + if _embedding_cache is not None: + _embedding_cache.clear() + _embedding_cache = None + + if has_torch() and has_transformers(): import numpy as np import torch @@ -23,13 +100,17 @@ @lru_cache(maxsize=1) def get_model() -> UniXcoder: - model = UniXcoder(UNIXCODER_MODEL) + model = UniXcoder(cs.UNIXCODER_MODEL) model.eval() if torch.cuda.is_available(): model = model.cuda() return model def embed_code(code: str, max_length: int | None = None) -> list[float]: + cache = get_embedding_cache() + if (cached := cache.get(code)) is not None: + return cached + if max_length is None: max_length = settings.EMBEDDING_MAX_LENGTH model = get_model() @@ -40,9 +121,63 @@ def embed_code(code: str, max_length: int | None = None) -> list[float]: _, sentence_embeddings = model(tokens_tensor) embedding: NDArray[np.float32] = sentence_embeddings.cpu().numpy() result: list[float] = embedding[0].tolist() + + cache.put(code, result) return result + def embed_code_batch( + snippets: list[str], + max_length: int | None = None, + batch_size: int = cs.EMBEDDING_DEFAULT_BATCH_SIZE, + ) -> list[list[float]]: + if not snippets: + return [] + + if max_length is None: + max_length = settings.EMBEDDING_MAX_LENGTH + + cache = get_embedding_cache() + cached_results = 
cache.get_many(snippets) + + if len(cached_results) == len(snippets): + logger.debug(ls.EMBEDDING_CACHE_HIT, count=len(snippets)) + return [cached_results[i] for i in range(len(snippets))] + + uncached_indices = [i for i in range(len(snippets)) if i not in cached_results] + uncached_snippets = [snippets[i] for i in uncached_indices] + + model = get_model() + device = next(model.parameters()).device + + all_new_embeddings: list[list[float]] = [] + for start in range(0, len(uncached_snippets), batch_size): + batch = uncached_snippets[start : start + batch_size] + tokens_list = model.tokenize(batch, max_length=max_length, padding=True) + tokens_tensor = torch.tensor(tokens_list).to(device) + with torch.no_grad(): + _, sentence_embeddings = model(tokens_tensor) + batch_np: NDArray[np.float32] = sentence_embeddings.cpu().numpy() + for row in batch_np: + all_new_embeddings.append(row.tolist()) + + cache.put_many(uncached_snippets, all_new_embeddings) + + results: list[list[float]] = [[] for _ in snippets] + for i, emb in cached_results.items(): + results[i] = emb + for idx, orig_i in enumerate(uncached_indices): + results[orig_i] = all_new_embeddings[idx] + + return results + else: def embed_code(code: str, max_length: int | None = None) -> list[float]: raise RuntimeError(ex.SEMANTIC_EXTRA) + + def embed_code_batch( + snippets: list[str], + max_length: int | None = None, + batch_size: int = cs.EMBEDDING_DEFAULT_BATCH_SIZE, + ) -> list[list[float]]: + raise RuntimeError(ex.SEMANTIC_EXTRA) diff --git a/codebase_rag/exceptions.py b/codebase_rag/exceptions.py index f30202395..3349cd431 100644 --- a/codebase_rag/exceptions.py +++ b/codebase_rag/exceptions.py @@ -11,10 +11,26 @@ "OpenAI provider requires api_key. " "Set ORCHESTRATOR_API_KEY or CYPHER_API_KEY in .env file." ) +ANTHROPIC_NO_KEY = ( + "Anthropic provider requires api_key. " + "Set ORCHESTRATOR_API_KEY or CYPHER_API_KEY in .env file." +) +AZURE_NO_KEY = "Azure OpenAI provider requires api_key. Set AZURE_API_KEY in .env file." +AZURE_NO_ENDPOINT = ( + "Azure OpenAI provider requires endpoint. Set AZURE_OPENAI_ENDPOINT in .env file." +) OLLAMA_NOT_RUNNING = ( "Ollama server not responding at {endpoint}. " "Make sure Ollama is running: ollama serve" ) +LITELLM_NO_ENDPOINT = ( + "LiteLLM provider requires endpoint. " + "Set ORCHESTRATOR_ENDPOINT or CYPHER_ENDPOINT in .env file." +) +LITELLM_NOT_RUNNING = ( + "LiteLLM proxy server not responding at {endpoint}. " + "Make sure LiteLLM proxy is running and API key is valid." +) UNKNOWN_PROVIDER = "Unknown provider '{provider}'. Available providers: {available}" # (H) Dependency errors @@ -42,12 +58,17 @@ # (H) LLM errors LLM_INIT_CYPHER = "Failed to initialize CypherGenerator: {error}" LLM_INVALID_QUERY = "LLM did not generate a valid query. Output: {output}" +LLM_DANGEROUS_QUERY = "LLM generated a destructive Cypher query (found '{keyword}'). Query rejected: {query}" LLM_GENERATION_FAILED = "Cypher generation failed: {error}" LLM_INIT_ORCHESTRATOR = "Failed to initialize RAG Orchestrator: {error}" # (H) Graph service errors BATCH_SIZE = "batch_size must be a positive integer" CONN = "Not connected to Memgraph." +AUTH_INCOMPLETE = ( + "Both username and password are required for authentication. " + "Either provide both or neither." +) # (H) Access control errors (used with raise) ACCESS_DENIED = "Access denied: Cannot access files outside the project root." 
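A quick usage sketch for the embedding cache wired into embedder.py above (a sketch, assuming the optional semantic extra, i.e. torch and transformers, is installed; the snippet strings are illustrative):

from codebase_rag.embedder import embed_code_batch, get_embedding_cache

snippets = [
    "def add(a, b):\n    return a + b",
    "def sub(a, b):\n    return a - b",
]

first = embed_code_batch(snippets)   # cache misses: one batched forward pass
second = embed_code_batch(snippets)  # cache hits: no model call is made
assert first == second

# Persists content-hash -> vector pairs under QDRANT_DB_PATH/.embedding_cache.json
get_embedding_cache().save()

Content-addressed keys (sha256 of the snippet) mean unchanged functions are never re-embedded across runs, which is what makes the incremental indexing path added later in this diff cheap.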
diff --git a/codebase_rag/graph_loader.py b/codebase_rag/graph_loader.py index b69635755..6a210c6d5 100644 --- a/codebase_rag/graph_loader.py +++ b/codebase_rag/graph_loader.py @@ -13,6 +13,18 @@ class GraphLoader: + __slots__ = ( + "file_path", + "_data", + "_nodes", + "_relationships", + "_nodes_by_id", + "_nodes_by_label", + "_outgoing_rels", + "_incoming_rels", + "_property_indexes", + ) + def __init__(self, file_path: str): self.file_path = Path(file_path) self._data: GraphData | None = None diff --git a/codebase_rag/graph_updater.py b/codebase_rag/graph_updater.py index 2620d2bcb..3b4d10c6b 100644 --- a/codebase_rag/graph_updater.py +++ b/codebase_rag/graph_updater.py @@ -1,9 +1,13 @@ +import hashlib +import json +import os import sys from collections import OrderedDict, defaultdict from collections.abc import Callable, ItemsView, KeysView from pathlib import Path from loguru import logger +from rich.progress import Progress, SpinnerColumn, TextColumn from tree_sitter import Node, Parser from . import constants as cs @@ -27,8 +31,12 @@ from .utils.path_utils import should_skip_path from .utils.source_extraction import extract_source_with_fallback +type FileHashCache = dict[str, str] + class FunctionRegistryTrie: + __slots__ = ("root", "_entries", "_simple_name_lookup") + def __init__(self, simple_name_lookup: SimpleNameLookup | None = None) -> None: self.root: TrieNode = {} self._entries: FunctionRegistry = {} @@ -150,9 +158,9 @@ def find_with_prefix_and_suffix( def find_ending_with(self, suffix: str) -> list[QualifiedName]: if self._simple_name_lookup is not None and suffix in self._simple_name_lookup: # (H) O(1) lookup using the simple_name_lookup index - return list(self._simple_name_lookup[suffix]) + return sorted(self._simple_name_lookup[suffix]) # (H) Fallback to linear scan if no index available - return [qn for qn in self._entries.keys() if qn.endswith(f".{suffix}")] + return sorted(qn for qn in self._entries.keys() if qn.endswith(f".{suffix}")) def find_with_prefix(self, prefix: str) -> list[tuple[QualifiedName, NodeType]]: node = self._navigate_to_prefix(prefix) @@ -160,6 +168,8 @@ def find_with_prefix(self, prefix: str) -> list[tuple[QualifiedName, NodeType]]: class BoundedASTCache: + __slots__ = ("cache", "max_entries", "max_memory_bytes") + def __init__( self, max_entries: int | None = None, @@ -220,6 +230,38 @@ def _should_evict_for_memory(self) -> bool: ) +def _hash_file(filepath: Path) -> str: + hasher = hashlib.sha256() + with filepath.open("rb") as f: + while chunk := f.read(8192): + hasher.update(chunk) + return hasher.hexdigest() + + +def _load_hash_cache(cache_path: Path) -> FileHashCache: + if not cache_path.is_file(): + return {} + try: + with cache_path.open(encoding="utf-8") as f: + data = json.load(f) + if isinstance(data, dict): + logger.info(ls.HASH_CACHE_LOADED, count=len(data), path=cache_path) + return data + except (json.JSONDecodeError, OSError) as e: + logger.warning(ls.HASH_CACHE_LOAD_FAILED, path=cache_path, error=e) + return {} + + +def _save_hash_cache(cache_path: Path, hashes: FileHashCache) -> None: + try: + cache_path.parent.mkdir(parents=True, exist_ok=True) + with cache_path.open("w", encoding="utf-8") as f: + json.dump(hashes, f, indent=2) + logger.info(ls.HASH_CACHE_SAVED, count=len(hashes), path=cache_path) + except OSError as e: + logger.warning(ls.HASH_CACHE_SAVE_FAILED, path=cache_path, error=e) + + class GraphUpdater: def __init__( self, @@ -229,12 +271,20 @@ def __init__( queries: dict[cs.SupportedLanguage, LanguageQueries], 
unignore_paths: frozenset[str] | None = None, exclude_paths: frozenset[str] | None = None, + project_name: str | None = None, ): self.ingestor = ingestor + self._single_file: Path | None = None + if repo_path.is_file(): + resolved = repo_path.resolve() + self._single_file = resolved + repo_path = resolved.parent self.repo_path = repo_path self.parsers = parsers self.queries = queries - self.project_name = repo_path.resolve().name + self.project_name = ( + project_name and project_name.strip() + ) or repo_path.resolve().name self.simple_name_lookup: SimpleNameLookup = defaultdict(set) self.function_registry = FunctionRegistryTrie( simple_name_lookup=self.simple_name_lookup @@ -261,19 +311,23 @@ def _is_dependency_file(self, file_name: str, filepath: Path) -> bool: or filepath.suffix.lower() == cs.CSPROJ_SUFFIX ) - def run(self) -> None: + def run(self, force: bool = False) -> None: self.ingestor.ensure_node_batch( cs.NODE_PROJECT, {cs.KEY_NAME: self.project_name} ) - logger.info(ls.ENSURING_PROJECT.format(name=self.project_name)) + logger.info(ls.ENSURING_PROJECT, name=self.project_name) logger.info(ls.PASS_1_STRUCTURE) self.factory.structure_processor.identify_structure() logger.info(ls.PASS_2_FILES) - self._process_files() + self._process_files(force=force) - logger.info(ls.FOUND_FUNCTIONS.format(count=len(self.function_registry))) + corrected = self.factory.definition_processor.resolve_deferred_cpp_methods() + if corrected: + logger.info("Resolved {} deferred C++ out-of-class methods", corrected) + + logger.info(ls.FOUND_FUNCTIONS, count=len(self.function_registry)) logger.info(ls.PASS_3_CALLS) self._process_function_calls() @@ -282,10 +336,12 @@ def run(self) -> None: logger.info(ls.ANALYSIS_COMPLETE) self.ingestor.flush_all() + self._prune_orphan_nodes() + self._generate_semantic_embeddings() def remove_file_from_state(self, file_path: Path) -> None: - logger.debug(ls.REMOVING_STATE.format(path=file_path)) + logger.debug(ls.REMOVING_STATE, path=file_path) if file_path in self.ast_cache: del self.ast_cache[file_path] @@ -307,45 +363,163 @@ def remove_file_from_state(self, file_path: Path) -> None: del self.function_registry[qn] if qns_to_remove: - logger.debug(ls.REMOVING_QNS.format(count=len(qns_to_remove))) + logger.debug(ls.REMOVING_QNS, count=len(qns_to_remove)) for simple_name, qn_set in self.simple_name_lookup.items(): original_count = len(qn_set) new_qn_set = qn_set - qns_to_remove if len(new_qn_set) < original_count: self.simple_name_lookup[simple_name] = new_qn_set - logger.debug(ls.CLEANED_SIMPLE_NAME.format(name=simple_name)) + logger.debug(ls.CLEANED_SIMPLE_NAME, name=simple_name) + + def _should_keep_dir(self, dirname: str, dir_prefix: str) -> bool: + if dirname not in cs.IGNORE_PATTERNS and ( + not self.exclude_paths or dirname not in self.exclude_paths + ): + return True + return bool( + self.unignore_paths + and any( + u.startswith(f"{dir_prefix}{dirname}/") or u == f"{dir_prefix}{dirname}" + for u in self.unignore_paths + ) + ) - def _process_files(self) -> None: - for filepath in self.repo_path.rglob("*"): - if filepath.is_file() and not should_skip_path( - filepath, + def _collect_eligible_files(self) -> list[Path]: + if self._single_file is not None: + if not should_skip_path( + self._single_file, self.repo_path, exclude_paths=self.exclude_paths, unignore_paths=self.unignore_paths, ): - lang_config = get_language_spec(filepath.suffix) + return [self._single_file] + return [] + + eligible: list[Path] = [] + hash_name = cs.HASH_CACHE_FILENAME + for dirpath, dirnames, 
filenames in os.walk(str(self.repo_path)): + rel_dir = Path(dirpath).relative_to(self.repo_path).as_posix() + dir_prefix = "" if rel_dir == "." else f"{rel_dir}/" + dirnames[:] = sorted( + d for d in dirnames if self._should_keep_dir(d, dir_prefix) + ) + for fname in sorted(filenames): + if fname == hash_name: + continue + filepath = Path(dirpath) / fname + if not should_skip_path( + filepath, + self.repo_path, + exclude_paths=self.exclude_paths, + unignore_paths=self.unignore_paths, + ): + eligible.append(filepath) + return eligible + + def _process_files(self, force: bool = False) -> None: + cache_path = self.repo_path / cs.HASH_CACHE_FILENAME + old_hashes = _load_hash_cache(cache_path) if not force else {} + if force: + logger.info(ls.INCREMENTAL_FORCE) + + eligible_files = self._collect_eligible_files() + new_hashes: FileHashCache = {} + skipped_count = 0 + changed_count = 0 + + current_file_keys: set[str] = set() + + processed_since_flush = 0 + + with Progress( + SpinnerColumn(), + TextColumn(ls.PROGRESS_INDEXING_LABEL), + TextColumn("[progress.description]{task.description}"), + transient=True, + ) as progress: + task = progress.add_task("", total=len(eligible_files)) + + for filepath in eligible_files: + file_key = str(filepath.relative_to(self.repo_path)) + current_file_keys.add(file_key) + + current_hash = _hash_file(filepath) + new_hashes[file_key] = current_hash + if ( - lang_config - and isinstance(lang_config.language, cs.SupportedLanguage) - and lang_config.language in self.parsers + not force + and file_key in old_hashes + and old_hashes[file_key] == current_hash ): - result = self.factory.definition_processor.process_file( - filepath, - lang_config.language, - self.queries, - self.factory.structure_processor.structural_elements, - ) - if result: - root_node, language = result - self.ast_cache[filepath] = (root_node, language) - elif self._is_dependency_file(filepath.name, filepath): - self.factory.definition_processor.process_dependencies(filepath) - - self.factory.structure_processor.process_generic_file( - filepath, filepath.name + logger.debug(ls.FILE_HASH_UNCHANGED, path=file_key) + skipped_count += 1 + progress.advance(task) + continue + + if file_key in old_hashes: + logger.debug(ls.FILE_HASH_CHANGED, path=file_key) + self.remove_file_from_state(filepath) + else: + logger.debug(ls.FILE_HASH_NEW, path=file_key) + + changed_count += 1 + self._process_single_file(filepath) + + processed_since_flush += 1 + if processed_since_flush >= settings.FILE_FLUSH_INTERVAL: + logger.info(ls.PERIODIC_FLUSH.format(count=processed_since_flush)) + self.ingestor.flush_all() + processed_since_flush = 0 + + progress.update( + task, + advance=1, + description=ls.PROGRESS_FILES_PROCESSED.format(count=changed_count), ) + deleted_keys = set(old_hashes.keys()) - current_file_keys + if deleted_keys: + logger.info(ls.INCREMENTAL_DELETED, count=len(deleted_keys)) + for deleted_key in deleted_keys: + deleted_path = self.repo_path / deleted_key + self.remove_file_from_state(deleted_path) + if isinstance(self.ingestor, QueryProtocol): + self.ingestor.execute_write( + cs.CYPHER_DELETE_MODULE, {cs.KEY_PATH: deleted_key} + ) + self.ingestor.execute_write( + cs.CYPHER_DELETE_FILE, {cs.KEY_PATH: deleted_key} + ) + + if skipped_count > 0: + logger.info(ls.INCREMENTAL_SKIPPED, count=skipped_count) + if changed_count > 0: + logger.info(ls.INCREMENTAL_CHANGED, count=changed_count) + + _save_hash_cache(cache_path, new_hashes) + + def _process_single_file(self, filepath: Path) -> None: + lang_config = 
get_language_spec(filepath.suffix) + if ( + lang_config + and isinstance(lang_config.language, cs.SupportedLanguage) + and lang_config.language in self.parsers + ): + result = self.factory.definition_processor.process_file( + filepath, + lang_config.language, + self.queries, + self.factory.structure_processor.structural_elements, + ) + if result: + root_node, language = result + self.ast_cache[filepath] = (root_node, language) + elif self._is_dependency_file(filepath.name, filepath): + self.factory.definition_processor.process_dependencies(filepath) + + self.factory.structure_processor.process_generic_file(filepath, filepath.name) + def _process_function_calls(self) -> None: ast_cache_items = list(self.ast_cache.items()) for file_path, (root_node, language) in ast_cache_items: @@ -353,6 +527,56 @@ def _process_function_calls(self) -> None: file_path, root_node, language, self.queries ) + def _prune_orphan_nodes(self) -> None: + """Remove graph nodes whose files/folders no longer exist on disk.""" + if not isinstance(self.ingestor, QueryProtocol): + return + + logger.info(ls.PRUNE_START) + total_pruned = 0 + + project_prefix = self.project_name + "." + repo_abs = self.repo_path.resolve().as_posix() + prune_specs: list[tuple[str, str, str]] = [ + (cs.CYPHER_ALL_FILE_PATHS, cs.CYPHER_DELETE_FILE, "File"), + ( + cs.CYPHER_ALL_MODULE_PATHS_INTERNAL, + cs.CYPHER_DELETE_MODULE, + "Module", + ), + (cs.CYPHER_ALL_FOLDER_PATHS, cs.CYPHER_DELETE_FOLDER, "Folder"), + ] + + for query_all, delete_query, label in prune_specs: + rows = self.ingestor.fetch_all(query_all) + orphans = [] + for r in rows: + path = r.get("path") + if not isinstance(path, str) or not path: + continue + abs_path = r.get("absolute_path") + qn = r.get("qualified_name", "") + if isinstance(abs_path, str) and not abs_path.startswith(repo_abs): + continue + if isinstance(qn, str) and qn and not qn.startswith(project_prefix): + continue + if not (self.repo_path / path).exists(): + orphans.append(path) + + if orphans: + logger.info(ls.PRUNE_FOUND, count=len(orphans), label=label) + for orphan_path in orphans: + logger.debug(ls.PRUNE_DELETING, label=label, path=orphan_path) + self.ingestor.execute_write( + delete_query, {cs.KEY_PATH: orphan_path} + ) + total_pruned += len(orphans) + + if total_pruned: + logger.info(ls.PRUNE_COMPLETE, count=total_pruned) + else: + logger.info(ls.PRUNE_SKIP) + def _generate_semantic_embeddings(self) -> None: if not has_semantic_dependencies(): logger.info(ls.SEMANTIC_NOT_AVAILABLE) @@ -363,22 +587,30 @@ def _generate_semantic_embeddings(self) -> None: return try: - from .embedder import embed_code - from .vector_store import store_embedding + from .embedder import embed_code, get_embedding_cache + from .vector_store import ( + close_qdrant_client, + store_embedding_batch, + verify_stored_ids, + ) logger.info(ls.PASS_4_EMBEDDINGS) results = self.ingestor.fetch_all( - cs.CYPHER_QUERY_EMBEDDINGS, {"project_name": self.project_name + "."} + cs.CYPHER_QUERY_EMBEDDINGS, {"project_name": self.project_name} ) if not results: logger.info(ls.NO_FUNCTIONS_FOR_EMBEDDING) return - logger.info(ls.GENERATING_EMBEDDINGS.format(count=len(results))) + logger.info(ls.GENERATING_EMBEDDINGS, count=len(results)) embedded_count = 0 + expected_ids: set[int] = set() + batch_buffer: list[tuple[int, list[float], str]] = [] + batch_size = settings.QDRANT_BATCH_SIZE + for row in results: parsed = self._parse_embedding_result(row) if parsed is None: @@ -391,33 +623,74 @@ def _generate_semantic_embeddings(self) -> None: file_path = 
parsed.get(cs.KEY_PATH) if start_line is None or end_line is None or file_path is None: - logger.debug(ls.NO_SOURCE_FOR.format(name=qualified_name)) + logger.debug(ls.NO_SOURCE_FOR, name=qualified_name) + continue - elif source_code := self._extract_source_code( + if source_code := self._extract_source_code( qualified_name, file_path, start_line, end_line ): try: embedding = embed_code(source_code) - store_embedding(node_id, embedding, qualified_name) - embedded_count += 1 + batch_buffer.append((node_id, embedding, qualified_name)) + expected_ids.add(node_id) - if embedded_count % settings.EMBEDDING_PROGRESS_INTERVAL == 0: + if len(batch_buffer) >= batch_size: + embedded_count += store_embedding_batch(batch_buffer) + batch_buffer = [] + + if ( + embedded_count % settings.EMBEDDING_PROGRESS_INTERVAL == 0 + and embedded_count > 0 + ): logger.debug( - ls.EMBEDDING_PROGRESS.format( - done=embedded_count, total=len(results) - ) + ls.EMBEDDING_PROGRESS, + done=embedded_count, + total=len(results), ) except Exception as e: logger.warning( - ls.EMBEDDING_FAILED.format(name=qualified_name, error=e) + ls.EMBEDDING_FAILED, name=qualified_name, error=e ) else: - logger.debug(ls.NO_SOURCE_FOR.format(name=qualified_name)) - logger.info(ls.EMBEDDINGS_COMPLETE.format(count=embedded_count)) + logger.debug(ls.NO_SOURCE_FOR, name=qualified_name) + + if batch_buffer: + embedded_count += store_embedding_batch(batch_buffer) + + logger.info(ls.EMBEDDINGS_COMPLETE, count=embedded_count) + + self._reconcile_embeddings(expected_ids, verify_stored_ids) + get_embedding_cache().save() + close_qdrant_client() + + except Exception as e: + logger.warning(ls.EMBEDDING_GENERATION_FAILED, error=e) + + def _reconcile_embeddings( + self, + expected_ids: set[int], + verify_fn: Callable[[set[int]], set[int]], + ) -> None: + if not expected_ids: + return + try: + stored_ids = verify_fn(expected_ids) + missing = expected_ids - stored_ids + if missing: + sample = sorted(missing)[:10] + logger.warning( + ls.EMBEDDING_RECONCILE_MISSING.format( + missing=len(missing), + expected=len(expected_ids), + sample_ids=sample, + ) + ) + else: + logger.info(ls.EMBEDDING_RECONCILE_OK.format(count=len(expected_ids))) except Exception as e: - logger.warning(ls.EMBEDDING_GENERATION_FAILED.format(error=e)) + logger.warning(ls.EMBEDDING_RECONCILE_FAILED.format(error=e)) def _extract_source_code( self, qualified_name: str, file_path: str, start_line: int, end_line: int diff --git a/codebase_rag/language_spec.py b/codebase_rag/language_spec.py index cf550ab08..4802fb3c8 100644 --- a/codebase_rag/language_spec.py +++ b/codebase_rag/language_spec.py @@ -97,6 +97,38 @@ def _rust_file_to_module(file_path: Path, repo_root: Path) -> list[str]: return [] +def _php_file_to_module(file_path: Path, repo_root: Path) -> list[str]: + try: + rel = file_path.relative_to(repo_root) + parts = list(rel.with_suffix("").parts) + if parts and parts[0] in ("src", "app", "lib"): + parts = parts[1:] + return parts + except ValueError: + return [] + + +def _c_unwrap_declarator(declarator: Node | None) -> Node | None: + while declarator and declarator.type == cs.CppNodeType.POINTER_DECLARATOR: + declarator = declarator.child_by_field_name(cs.FIELD_DECLARATOR) + return declarator + + +def _c_get_name(node: Node) -> str | None: + if node.type in cs.C_NAME_NODE_TYPES: + name_node = node.child_by_field_name(cs.FIELD_NAME) + if name_node and name_node.text: + return name_node.text.decode(cs.ENCODING_UTF8) + elif node.type == cs.TS_CPP_FUNCTION_DEFINITION: + declarator = 
node.child_by_field_name(cs.FIELD_DECLARATOR) + declarator = _c_unwrap_declarator(declarator) + if declarator and declarator.type == cs.TS_CPP_FUNCTION_DECLARATOR: + name_node = declarator.child_by_field_name(cs.FIELD_DECLARATOR) + if name_node and name_node.type == cs.TS_IDENTIFIER and name_node.text: + return name_node.text.decode(cs.ENCODING_UTF8) + return _generic_get_name(node) + + def _cpp_get_name(node: Node) -> str | None: if node.type in cs.CPP_NAME_NODE_TYPES: name_node = node.child_by_field_name(cs.FIELD_NAME) @@ -154,6 +186,13 @@ def _cpp_get_name(node: Node) -> str | None: file_to_module_parts=_generic_file_to_module, ) +C_FQN_SPEC = FQNSpec( + scope_node_types=frozenset(cs.FQN_C_SCOPE_TYPES), + function_node_types=frozenset(cs.FQN_C_FUNCTION_TYPES), + get_name=_c_get_name, + file_to_module_parts=_generic_file_to_module, +) + LUA_FQN_SPEC = FQNSpec( scope_node_types=frozenset(cs.FQN_LUA_SCOPE_TYPES), function_node_types=frozenset(cs.FQN_LUA_FUNCTION_TYPES), @@ -186,7 +225,7 @@ def _cpp_get_name(node: Node) -> str | None: scope_node_types=frozenset(cs.FQN_PHP_SCOPE_TYPES), function_node_types=frozenset(cs.FQN_PHP_FUNCTION_TYPES), get_name=_generic_get_name, - file_to_module_parts=_generic_file_to_module, + file_to_module_parts=_php_file_to_module, ) LANGUAGE_FQN_SPECS: dict[cs.SupportedLanguage, FQNSpec] = { @@ -195,6 +234,7 @@ def _cpp_get_name(node: Node) -> str | None: cs.SupportedLanguage.TS: TS_FQN_SPEC, cs.SupportedLanguage.RUST: RUST_FQN_SPEC, cs.SupportedLanguage.JAVA: JAVA_FQN_SPEC, + cs.SupportedLanguage.C: C_FQN_SPEC, cs.SupportedLanguage.CPP: CPP_FQN_SPEC, cs.SupportedLanguage.LUA: LUA_FQN_SPEC, cs.SupportedLanguage.GO: GO_FQN_SPEC, @@ -343,6 +383,28 @@ def _cpp_get_name(node: Node) -> str | None: type: (type_identifier) @name) @call """, ), + cs.SupportedLanguage.C: LanguageSpec( + language=cs.SupportedLanguage.C, + file_extensions=cs.C_EXTENSIONS, + function_node_types=cs.SPEC_C_FUNCTION_TYPES, + class_node_types=cs.SPEC_C_CLASS_TYPES, + module_node_types=cs.SPEC_C_MODULE_TYPES, + call_node_types=cs.SPEC_C_CALL_TYPES, + import_node_types=cs.IMPORT_NODES_INCLUDE, + import_from_node_types=cs.IMPORT_NODES_INCLUDE, + package_indicators=cs.SPEC_C_PACKAGE_INDICATORS, + function_query=""" + (function_definition) @function + """, + class_query=""" + (struct_specifier) @class + (union_specifier) @class + (enum_specifier) @class + """, + call_query=""" + (call_expression) @call + """, + ), cs.SupportedLanguage.CPP: LanguageSpec( language=cs.SupportedLanguage.CPP, file_extensions=cs.CPP_EXTENSIONS, @@ -398,6 +460,42 @@ def _cpp_get_name(node: Node) -> str | None: class_node_types=cs.SPEC_PHP_CLASS_TYPES, module_node_types=cs.SPEC_PHP_MODULE_TYPES, call_node_types=cs.SPEC_PHP_CALL_TYPES, + import_node_types=cs.SPEC_PHP_IMPORT_TYPES, + import_from_node_types=cs.SPEC_PHP_IMPORT_FROM_TYPES, + function_query=""" + (function_definition + name: (name) @name) @function + (method_declaration + name: (name) @name) @function + (anonymous_function) @function + (arrow_function) @function + """, + class_query=""" + (class_declaration + name: (name) @name) @class + (interface_declaration + name: (name) @name) @class + (trait_declaration + name: (name) @name) @class + (enum_declaration + name: (name) @name) @class + """, + call_query=""" + (function_call_expression + function: (name) @name) @call + (function_call_expression + function: (qualified_name) @name) @call + (member_call_expression + name: (name) @name) @call + (scoped_call_expression + name: (name) @name) @call + 
(nullsafe_member_call_expression + name: (name) @name) @call + (object_creation_expression + (name) @name) @call + (object_creation_expression + (qualified_name) @name) @call + """, ), cs.SupportedLanguage.LUA: LanguageSpec( language=cs.SupportedLanguage.LUA, diff --git a/codebase_rag/logs.py b/codebase_rag/logs.py index 3e075c877..f73baf15e 100644 --- a/codebase_rag/logs.py +++ b/codebase_rag/logs.py @@ -49,7 +49,22 @@ EMBEDDINGS_COMPLETE = "Successfully generated {count} semantic embeddings" EMBEDDING_GENERATION_FAILED = "Failed to generate semantic embeddings: {error}" EMBEDDING_STORE_FAILED = "Failed to store embedding for {name}: {error}" +EMBEDDING_STORE_RETRY = "Qdrant upsert failed (attempt {attempt}/{max_attempts}), retrying in {delay:.1f}s: {error}" +EMBEDDING_BATCH_STORED = "Stored batch of {count} embeddings in Qdrant" +EMBEDDING_BATCH_FAILED = "Failed to store embedding batch: {error}" EMBEDDING_SEARCH_FAILED = "Failed to search embeddings: {error}" +EMBEDDING_RECONCILE_OK = "Qdrant reconciliation: all {count} expected embeddings found" +EMBEDDING_RECONCILE_MISSING = "Qdrant reconciliation: {missing} of {expected} embeddings missing (IDs: {sample_ids})" +EMBEDDING_RECONCILE_FAILED = "Qdrant reconciliation check failed: {error}" +QDRANT_DELETE_PROJECT = "Deleting {count} Qdrant vectors for project '{project}'" +QDRANT_DELETE_PROJECT_DONE = "Deleted Qdrant vectors for project '{project}'" +QDRANT_DELETE_PROJECT_FAILED = ( + "Failed to delete Qdrant vectors for project '{project}': {error}" +) +EMBEDDING_CACHE_HIT = "Embedding cache hit for {count} snippets" +EMBEDDING_CACHE_LOADED = "Loaded embedding cache with {count} entries from {path}" +EMBEDDING_CACHE_SAVE_FAILED = "Failed to save embedding cache to {path}: {error}" +EMBEDDING_CACHE_LOAD_FAILED = "Failed to load embedding cache from {path}: {error}" # (H) Image logs IMAGE_COPIED = "Copied image to temporary path: {path}" @@ -97,8 +112,25 @@ # (H) File watcher logs WATCHER_ACTIVE = "File watcher is now active." +WATCHER_DEBOUNCE_ACTIVE = ( + "File watcher active with debouncing (debounce={debounce}s, max_wait={max_wait}s)" +) WATCHER_SKIP_NO_QUERY = "Ingestor does not support querying, skipping real-time update." CHANGE_DETECTED = "Change detected: {event_type} on {path}. Updating graph." +CHANGE_DEBOUNCING = ( + "Change detected: {event_type} on {name} (debouncing for {debounce}s)" +) +DEBOUNCE_RESET = "Reset debounce timer for {path}" +DEBOUNCE_MAX_WAIT = "Max wait ({max_wait}s) exceeded for {path}, processing now" +DEBOUNCE_SCHEDULED = ( + "Scheduled update for {path} in {debounce}s (max wait: {remaining}s remaining)" +) +DEBOUNCE_PROCESSING = "Processing debounced change: {path}" +DEBOUNCE_NO_EVENT = "No pending event for {path}, skipping" +DEBOUNCE_MAX_WAIT_ADJUSTED = ( + "max_wait ({max_wait}s) is less than debounce ({debounce}s). " + "Setting max_wait to debounce value." +) DELETION_QUERY = "Ran deletion query for path: {path}" RECALC_CALLS = "Recalculating all function call relationships for consistency..." GRAPH_UPDATED = "Graph updated successfully for change in: {name}" @@ -155,7 +187,8 @@ # (H) Memgraph logs MG_CONNECTING = "Connecting to Memgraph at {host}:{port}..." MG_CONNECTED = "Successfully connected to Memgraph." -MG_EXCEPTION = "An exception occurred: {error}. Flushing remaining items..." +MG_EXCEPTION = "An exception occurred: {error}. Attempting best-effort flush..." +MG_FLUSH_ERROR = "Failed to flush during cleanup: {error}" MG_DISCONNECTED = "\nDisconnected from Memgraph." MG_CYPHER_ERROR = "!!! 
Cypher Error: {error}" MG_CYPHER_QUERY = " Query: {query}" @@ -177,7 +210,9 @@ "Relationship buffer reached batch size ({size}). Performing incremental flush." ) MG_NO_CONSTRAINT = "No unique constraint defined for label '{label}'. Skipping flush." -MG_MISSING_PROP = "Skipping {label} node missing required '{key}' property: {props}" +MG_MISSING_PROP = ( + "Skipping {label} node missing required '{key}' property (keys: {prop_keys})" +) MG_NODES_FLUSHED = "Flushed {flushed} of {total} buffered nodes." MG_NODES_SKIPPED = ( "Skipped {count} buffered nodes due to missing identifiers or constraints." @@ -189,6 +224,18 @@ ) MG_FLUSH_START = "--- Flushing all pending writes to database... ---" MG_FLUSH_COMPLETE = "--- Flushing complete. ---" +MG_PARALLEL_FLUSH_NODES = ( + "Parallel flushing {count} label groups with {workers} workers" +) +MG_PARALLEL_FLUSH_RELS = ( + "Parallel flushing {count} relationship groups with {workers} workers" +) +MG_LABEL_FLUSH_ERROR = "Error flushing label group '{label}': {error}" +MG_REL_FLUSH_ERROR = "Error flushing relationship group '{pattern}': {error}" +MG_NO_CONN_NODES = "No database connection for label '{label}', skipping flush." +MG_NO_CONN_RELS = ( + "No database connection for relationship group '{pattern}', skipping flush." +) MG_FETCH_QUERY = "Executing fetch query: {query} with params: {params}" MG_WRITE_QUERY = "Executing write query: {query} with params: {params}" MG_EXPORTING = "Exporting graph data..." @@ -215,6 +262,10 @@ ) TOOL_QUERY_RECEIVED = "[Tool:QueryGraph] Received NL query: '{query}'" TOOL_QUERY_ERROR = "[Tool:QueryGraph] Error during query execution: {error}" +QUERY_RESULTS_TRUNCATED = ( + "[Tool:QueryGraph] Results truncated: showing {kept} of {total} rows " + "({tokens} tokens, limit {max_tokens})" +) TOOL_SHELL_EXEC = "Executing shell command: {cmd}" TOOL_SHELL_RETURN = "Return code: {code}" TOOL_SHELL_STDOUT = "Stdout: {stdout}" @@ -312,6 +363,7 @@ # (H) Error logs (used with logger.error/warning) UNEXPECTED = "An unexpected error occurred: {error}" EXPORT_ERROR = "Export error: {error}" +STATS_ERROR = "Stats error: {error}" INDEXING_FAILED = "Indexing failed" PATH_NOT_IN_QUESTION = ( "Could not find original path in question for replacement: {path}" @@ -324,6 +376,7 @@ CALL_PROCESSING_FILE = "Processing calls in cached AST for: {path}" CALL_PROCESSING_FAILED = "Failed to process calls in {path}: {error}" CALL_FOUND_NODES = "Found {count} call nodes in {language} for {caller}" +CALL_SKIP_CLASS = "Skipping CALLS edge from {caller} to {call_name} (callee is Class node: {callee_qn})" CALL_FOUND = ( "Found call from {caller} to {call_name} (resolved as {callee_type}:{callee_qn})" ) @@ -593,6 +646,14 @@ MCP_ERROR_WRITE = "[MCP] Error writing file: {error}" MCP_LIST_DIR = "[MCP] list_directory: {path}" MCP_ERROR_LIST_DIR = "[MCP] Error listing directory: {error}" +MCP_SEMANTIC_NOT_AVAILABLE = ( + "[MCP] Semantic search not available. 
Install with: uv sync --extra semantic" +) +MCP_UPDATING_REPO = "[MCP] Updating repository at: {path}" +MCP_ERROR_UPDATING = "[MCP] Error updating repository: {error}" +MCP_SEMANTIC_SEARCH = "[MCP] semantic_search: {query}" +MCP_ASK_AGENT = "[MCP] ask_agent: {question}" +MCP_ASK_AGENT_ERROR = "[MCP] Error running ask_agent: {error}" # (H) MCP server logs MCP_SERVER_INFERRED_ROOT = "[GraphCode MCP] Using inferred project root: {path}" @@ -612,6 +673,31 @@ MCP_SERVER_CONNECTED = "[GraphCode MCP] Connected to Memgraph at {host}:{port}" MCP_SERVER_FATAL_ERROR = "[GraphCode MCP] Fatal error: {error}" MCP_SERVER_SHUTDOWN = "[GraphCode MCP] Shutting down server..." +MCP_HTTP_SERVER_STARTING = "[GraphCode MCP] Starting HTTP server on {host}:{port}..." +MCP_HTTP_SERVER_READY = ( + "[GraphCode MCP] HTTP server ready. MCP endpoint: http://{host}:{port}/mcp" +) + +# (H) Incremental update logs +HASH_CACHE_LOADED = "Loaded hash cache with {count} entries from {path}" +HASH_CACHE_LOAD_FAILED = "Failed to load hash cache from {path}: {error}" +HASH_CACHE_SAVED = "Saved hash cache with {count} entries to {path}" +HASH_CACHE_SAVE_FAILED = "Failed to save hash cache to {path}: {error}" +PERIODIC_FLUSH = "Periodic flush after {count} files processed" +INCREMENTAL_SKIPPED = "Skipped {count} unchanged files" +INCREMENTAL_CHANGED = "Re-indexing {count} changed files" +INCREMENTAL_DELETED = "Removed state for {count} deleted files" +INCREMENTAL_FORCE = "Force mode enabled, bypassing hash cache" + +# (H) Orphan pruning logs +PRUNE_START = "--- Pruning orphan nodes from graph ---" +PRUNE_FOUND = "Found {count} orphan {label} nodes to remove" +PRUNE_DELETING = "Pruning orphan {label}: {path}" +PRUNE_COMPLETE = "Pruning complete. Removed {count} orphan nodes." +PRUNE_SKIP = "No orphan nodes found. Graph is clean." +FILE_HASH_UNCHANGED = "File unchanged (hash match): {path}" +FILE_HASH_CHANGED = "File changed (hash mismatch): {path}" +FILE_HASH_NEW = "New file detected: {path}" # (H) Exclude prompt logs EXCLUDE_INVALID_INDEX = "Invalid index: {index} (out of range)" @@ -621,3 +707,7 @@ MODEL_SWITCHED = "Model switched to: {model}" MODEL_SWITCH_FAILED = "Failed to switch model: {error}" MODEL_CURRENT = "Current model: {model}" + +# (H) Progress bar logs +PROGRESS_INDEXING_LABEL = "[bold blue]Indexing files..." 
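[Editor's sketch] PROGRESS_INDEXING_LABEL and PROGRESS_FILES_PROCESSED feed the transient rich progress display that _process_files builds in graph_updater.py. A self-contained sketch of that spinner setup, using the same column layout as the diff (the loop body is a stand-in for real file processing):

    from rich.progress import Progress, SpinnerColumn, TextColumn

    # transient=True clears the bar on exit, so logs stay clean afterwards.
    with Progress(
        SpinnerColumn(),
        TextColumn("[bold blue]Indexing files..."),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:
        task = progress.add_task("", total=100)
        for done in range(1, 101):
            # The description is re-rendered each update, mirroring
            # PROGRESS_FILES_PROCESSED = "{count} processed".
            progress.update(task, advance=1, description=f"{done} processed")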
+PROGRESS_FILES_PROCESSED = "{count} processed" diff --git a/codebase_rag/main.py b/codebase_rag/main.py index af58a84a4..0a74c0b15 100644 --- a/codebase_rag/main.py +++ b/codebase_rag/main.py @@ -752,6 +752,8 @@ def connect_memgraph(batch_size: int) -> MemgraphIngestor: host=settings.MEMGRAPH_HOST, port=settings.MEMGRAPH_PORT, batch_size=batch_size, + username=settings.MEMGRAPH_USERNAME, + password=settings.MEMGRAPH_PASSWORD, ) @@ -1021,13 +1023,25 @@ def _initialize_services_and_agent( return rag_agent, confirmation_tool_names +def main_single_query(repo_path: str, batch_size: int, question: str) -> None: + _setup_common_initialization(repo_path) + # (H) Override logger to stderr so stdout is clean for scripted output + logger.remove() + logger.add(sys.stderr, level=cs.LOG_LEVEL_ERROR, format=cs.LOG_FORMAT) + + with connect_memgraph(batch_size) as ingestor: + rag_agent, _ = _initialize_services_and_agent(repo_path, ingestor) + response = asyncio.run(rag_agent.run(question, message_history=[])) + print(response.output) # noqa: T201 + + async def main_async(repo_path: str, batch_size: int) -> None: project_root = _setup_common_initialization(repo_path) table = _create_configuration_table(repo_path) app_context.console.print(table) - with connect_memgraph(batch_size) as ingestor: + async with connect_memgraph(batch_size) as ingestor: app_context.console.print(style(cs.MSG_CONNECTED_MEMGRAPH, cs.Color.GREEN)) app_context.console.print( Panel( @@ -1063,7 +1077,7 @@ async def main_optimize_async( effective_batch_size = settings.resolve_batch_size(batch_size) - with connect_memgraph(effective_batch_size) as ingestor: + async with connect_memgraph(effective_batch_size) as ingestor: app_context.console.print(style(cs.MSG_CONNECTED_MEMGRAPH, cs.Color.GREEN)) rag_agent, tool_names = _initialize_services_and_agent( diff --git a/codebase_rag/mcp/__init__.py b/codebase_rag/mcp/__init__.py index 77c80d78a..f3a26b0b7 100644 --- a/codebase_rag/mcp/__init__.py +++ b/codebase_rag/mcp/__init__.py @@ -1 +1,2 @@ -from codebase_rag.mcp.server import main as main +from codebase_rag.mcp.server import serve_http as serve_http +from codebase_rag.mcp.server import serve_stdio as serve_stdio diff --git a/codebase_rag/mcp/client.py b/codebase_rag/mcp/client.py new file mode 100644 index 000000000..b6abb205d --- /dev/null +++ b/codebase_rag/mcp/client.py @@ -0,0 +1,65 @@ +import asyncio +import io +import json +import os +import sys + +import typer +from mcp import ClientSession +from mcp.client.stdio import StdioServerParameters, stdio_client + +from codebase_rag import constants as cs + +app = typer.Typer() + + +async def _query_with_errlog(question: str, errlog: io.TextIOWrapper) -> dict[str, str]: + server_params = StdioServerParameters( + command=sys.executable, + args=["-m", "codebase_rag.cli", "mcp-server"], + ) + + async with stdio_client(server=server_params, errlog=errlog) as (read, write): + async with ClientSession(read, write) as session: + await session.initialize() + result = await session.call_tool( + cs.MCPToolName.ASK_AGENT, + {cs.MCPParamName.QUESTION: question}, + ) + + if result.content: + response_text = result.content[0].text + try: + parsed = json.loads(response_text) + if isinstance(parsed, dict): + return parsed + return {"output": str(parsed)} + except json.JSONDecodeError: + return {"output": response_text} + return {"output": "No response from server"} + + +def query_mcp_server(question: str) -> dict[str, str]: + with open(os.devnull, "w") as devnull: # noqa: SIM115 + return 
asyncio.run(_query_with_errlog(question, devnull)) + + +@app.command() +def main( + question: str = typer.Option( + ..., "--ask-agent", "-a", help="Question to ask about the codebase" + ), +) -> None: + try: + result = query_mcp_server(question) + if isinstance(result, dict) and "output" in result: + print(result["output"]) # noqa: T201 + else: + print(json.dumps(result)) # noqa: T201 + except Exception as e: + print(f"Error: {e}", file=sys.stderr) # noqa: T201 + sys.exit(1) + + +if __name__ == "__main__": + app() diff --git a/codebase_rag/mcp/server.py b/codebase_rag/mcp/server.py index 9218a2d93..ff5a5ce0d 100644 --- a/codebase_rag/mcp/server.py +++ b/codebase_rag/mcp/server.py @@ -71,6 +71,8 @@ def create_server() -> tuple[Server, MemgraphIngestor]: host=settings.MEMGRAPH_HOST, port=settings.MEMGRAPH_PORT, batch_size=settings.MEMGRAPH_BATCH_SIZE, + username=settings.MEMGRAPH_USERNAME, + password=settings.MEMGRAPH_PASSWORD, ) cypher_generator = CypherGenerator() @@ -135,7 +137,7 @@ async def call_tool(name: str, arguments: MCPToolArguments) -> list[TextContent] return server, ingestor -async def main() -> None: +async def serve_stdio() -> None: logger.info(lg.MCP_SERVER_STARTING) server, ingestor = create_server() @@ -159,7 +161,52 @@ async def main() -> None: logger.info(lg.MCP_SERVER_SHUTDOWN) +async def serve_http( + host: str = settings.MCP_HTTP_HOST, + port: int = settings.MCP_HTTP_PORT, +) -> None: + import contextlib + + import uvicorn + from mcp.server.streamable_http_manager import StreamableHTTPSessionManager + from starlette.applications import Starlette + from starlette.routing import Mount + + logger.info(lg.MCP_HTTP_SERVER_STARTING.format(host=host, port=port)) + + server, ingestor = create_server() + + session_manager = StreamableHTTPSessionManager( + app=server, + json_response=False, + stateless=False, + ) + + @contextlib.asynccontextmanager + async def lifespan(app: Starlette): + with ingestor: + logger.info( + lg.MCP_SERVER_CONNECTED.format( + host=settings.MEMGRAPH_HOST, port=settings.MEMGRAPH_PORT + ) + ) + async with session_manager.run(): + logger.info(lg.MCP_HTTP_SERVER_READY.format(host=host, port=port)) + yield + + starlette_app = Starlette( + routes=[ + Mount(settings.MCP_HTTP_ENDPOINT_PATH, app=session_manager.handle_request), + ], + lifespan=lifespan, + ) + + config = uvicorn.Config(starlette_app, host=host, port=port, log_level="info") + uvicorn_server = uvicorn.Server(config) + await uvicorn_server.serve() + + if __name__ == "__main__": import asyncio - asyncio.run(main()) + asyncio.run(serve_stdio()) diff --git a/codebase_rag/mcp/tools.py b/codebase_rag/mcp/tools.py index 5d1d2f7f5..f340fa4fd 100644 --- a/codebase_rag/mcp/tools.py +++ b/codebase_rag/mcp/tools.py @@ -1,7 +1,11 @@ +import asyncio import itertools +import sys from pathlib import Path from loguru import logger +from pydantic_ai import Agent +from rich.console import Console from codebase_rag import constants as cs from codebase_rag import logs as lg @@ -10,17 +14,25 @@ from codebase_rag.models import ToolMetadata from codebase_rag.parser_loader import load_parsers from codebase_rag.services.graph_service import MemgraphIngestor -from codebase_rag.services.llm import CypherGenerator +from codebase_rag.services.llm import CypherGenerator, create_rag_orchestrator from codebase_rag.tools import tool_descriptions as td -from codebase_rag.tools.code_retrieval import CodeRetriever, create_code_retrieval_tool +from codebase_rag.tools.code_retrieval import ( + CodeRetriever, + 
create_code_retrieval_tool, +) from codebase_rag.tools.codebase_query import create_query_tool from codebase_rag.tools.directory_lister import ( DirectoryLister, create_directory_lister_tool, ) +from codebase_rag.tools.document_analyzer import ( + DocumentAnalyzer, + create_document_analyzer_tool, +) from codebase_rag.tools.file_editor import FileEditor, create_file_editor_tool from codebase_rag.tools.file_reader import FileReader, create_file_reader_tool from codebase_rag.tools.file_writer import FileWriter, create_file_writer_tool +from codebase_rag.tools.shell_command import ShellCommander, create_shell_command_tool from codebase_rag.types_defs import ( CodeSnippetResultDict, DeleteProjectErrorResult, @@ -35,6 +47,8 @@ MCPToolSchema, QueryResultDict, ) +from codebase_rag.utils.dependencies import has_semantic_dependencies +from codebase_rag.vector_store import delete_project_embeddings class MCPToolsRegistry: @@ -47,6 +61,7 @@ def __init__( self.project_root = project_root self.ingestor = ingestor self.cypher_gen = cypher_gen + self._ingestor_lock = asyncio.Lock() self.parsers, self.queries = load_parsers() @@ -55,9 +70,12 @@ def __init__( self.file_reader = FileReader(project_root=project_root) self.file_writer = FileWriter(project_root=project_root) self.directory_lister = DirectoryLister(project_root=project_root) + self.shell_commander = ShellCommander(project_root=project_root) + self.document_analyzer = DocumentAnalyzer(project_root=project_root) + stderr_console = Console(file=sys.stderr, width=None, force_terminal=True) self._query_tool = create_query_tool( - ingestor=ingestor, cypher_gen=cypher_gen, console=None + ingestor=ingestor, cypher_gen=cypher_gen, console=stderr_console ) self._code_tool = create_code_retrieval_tool(code_retriever=self.code_retriever) self._file_editor_tool = create_file_editor_tool(file_editor=self.file_editor) @@ -66,6 +84,27 @@ def __init__( self._directory_lister_tool = create_directory_lister_tool( directory_lister=self.directory_lister ) + self._shell_command_tool = create_shell_command_tool( + shell_commander=self.shell_commander + ) + self._document_analyzer_tool = create_document_analyzer_tool( + self.document_analyzer + ) + + self._rag_agent: Agent | None = None + + self._semantic_search_tool = None + self._semantic_search_available = False + + if has_semantic_dependencies(): + from codebase_rag.tools.semantic_search import ( + create_semantic_search_tool, + ) + + self._semantic_search_tool = create_semantic_search_tool() + self._semantic_search_available = True + else: + logger.info(lg.MCP_SEMANTIC_NOT_AVAILABLE) self._tools: dict[str, ToolMetadata] = { cs.MCPToolName.LIST_PROJECTS: ToolMetadata( @@ -122,6 +161,17 @@ def __init__( handler=self.index_repository, returns_json=False, ), + cs.MCPToolName.UPDATE_REPOSITORY: ToolMetadata( + name=cs.MCPToolName.UPDATE_REPOSITORY, + description=td.MCP_TOOLS[cs.MCPToolName.UPDATE_REPOSITORY], + input_schema=MCPInputSchema( + type=cs.MCPSchemaType.OBJECT, + properties={}, + required=[], + ), + handler=self.update_repository, + returns_json=False, + ), cs.MCPToolName.QUERY_CODE_GRAPH: ToolMetadata( name=cs.MCPToolName.QUERY_CODE_GRAPH, description=td.MCP_TOOLS[cs.MCPToolName.QUERY_CODE_GRAPH], @@ -247,33 +297,121 @@ def __init__( returns_json=False, ), } + if self._semantic_search_available: + self._tools[cs.MCPToolName.SEMANTIC_SEARCH] = ToolMetadata( + name=cs.MCPToolName.SEMANTIC_SEARCH, + description=td.MCP_TOOLS[cs.MCPToolName.SEMANTIC_SEARCH], + input_schema=MCPInputSchema( + 
type=cs.MCPSchemaType.OBJECT, + properties={ + cs.MCPParamName.NATURAL_LANGUAGE_QUERY: MCPInputSchemaProperty( + type=cs.MCPSchemaType.STRING, + description=td.MCP_PARAM_NATURAL_LANGUAGE_QUERY, + ), + cs.MCPParamName.TOP_K: MCPInputSchemaProperty( + type=cs.MCPSchemaType.INTEGER, + description=td.MCP_PARAM_TOP_K, + default=5, + ), + }, + required=[cs.MCPParamName.NATURAL_LANGUAGE_QUERY], + ), + handler=self.semantic_search, + returns_json=False, + ) + + self._tools[cs.MCPToolName.ASK_AGENT] = ToolMetadata( + name=cs.MCPToolName.ASK_AGENT, + description=td.MCP_TOOLS[cs.MCPToolName.ASK_AGENT], + input_schema=MCPInputSchema( + type=cs.MCPSchemaType.OBJECT, + properties={ + cs.MCPParamName.QUESTION: MCPInputSchemaProperty( + type=cs.MCPSchemaType.STRING, + description=td.MCP_PARAM_QUESTION, + ) + }, + required=[cs.MCPParamName.QUESTION], + ), + handler=self.ask_agent, + returns_json=True, + ) + + @property + def rag_agent(self) -> Agent: + if self._rag_agent is None: + from codebase_rag.tools.semantic_search import ( + create_get_function_source_tool, + ) + + tools = [ + self._query_tool, + self._code_tool, + self._file_reader_tool, + self._file_writer_tool, + self._file_editor_tool, + self._shell_command_tool, + self._directory_lister_tool, + self._document_analyzer_tool, + create_get_function_source_tool(), + ] + if self._semantic_search_tool is not None: + tools.append(self._semantic_search_tool) + self._rag_agent = create_rag_orchestrator(tools=tools) + return self._rag_agent + + # (H) Setter allows tests to inject a mock agent without triggering LLM init + @rag_agent.setter + def rag_agent(self, value: Agent) -> None: + self._rag_agent = value async def list_projects(self) -> ListProjectsResult: logger.info(lg.MCP_LISTING_PROJECTS) try: - projects = self.ingestor.list_projects() + projects = await asyncio.to_thread(self.ingestor.list_projects) return ListProjectsSuccessResult(projects=projects, count=len(projects)) except Exception as e: logger.error(lg.MCP_ERROR_LIST_PROJECTS.format(error=e)) return ListProjectsErrorResult(error=str(e), projects=[], count=0) + def _get_project_node_ids(self, project_name: str) -> list[int]: + rows = self.ingestor.fetch_all( + cs.CYPHER_QUERY_PROJECT_NODE_IDS, + {cs.KEY_PROJECT_NAME: project_name}, + ) + result: list[int] = [] + for row in rows: + node_id = row.get(cs.KEY_NODE_ID) + if isinstance(node_id, int): + result.append(node_id) + return result + + def _cleanup_project_embeddings(self, project_name: str) -> None: + node_ids = self._get_project_node_ids(project_name) + delete_project_embeddings(project_name, node_ids) + + def _delete_project_sync(self, project_name: str) -> DeleteProjectResult: + projects = self.ingestor.list_projects() + if project_name not in projects: + return DeleteProjectErrorResult( + success=False, + error=te.MCP_PROJECT_NOT_FOUND.format( + project_name=project_name, projects=projects + ), + ) + self._cleanup_project_embeddings(project_name) + self.ingestor.delete_project(project_name) + return DeleteProjectSuccessResult( + success=True, + project=project_name, + message=cs.MCP_PROJECT_DELETED.format(project_name=project_name), + ) + async def delete_project(self, project_name: str) -> DeleteProjectResult: logger.info(lg.MCP_DELETING_PROJECT.format(project_name=project_name)) try: - projects = self.ingestor.list_projects() - if project_name not in projects: - return DeleteProjectErrorResult( - success=False, - error=te.MCP_PROJECT_NOT_FOUND.format( - project_name=project_name, projects=projects - ), - ) - 
self.ingestor.delete_project(project_name) - return DeleteProjectSuccessResult( - success=True, - project=project_name, - message=cs.MCP_PROJECT_DELETED.format(project_name=project_name), - ) + async with self._ingestor_lock: + return await asyncio.to_thread(self._delete_project_sync, project_name) except Exception as e: logger.error(lg.MCP_ERROR_DELETE_PROJECT.format(error=e)) return DeleteProjectErrorResult(success=False, error=str(e)) @@ -283,34 +421,76 @@ async def wipe_database(self, confirm: bool) -> str: return cs.MCP_WIPE_CANCELLED logger.warning(lg.MCP_WIPING_DATABASE) try: - self.ingestor.clean_database() + async with self._ingestor_lock: + await asyncio.to_thread(self.ingestor.clean_database) return cs.MCP_WIPE_SUCCESS except Exception as e: logger.error(lg.MCP_ERROR_WIPE.format(error=e)) return cs.MCP_WIPE_ERROR.format(error=e) + def _index_repository_sync(self) -> str: + project_name = Path(self.project_root).resolve().name + logger.info(lg.MCP_CLEARING_PROJECT.format(project_name=project_name)) + self._cleanup_project_embeddings(project_name) + self.ingestor.delete_project(project_name) + + updater = GraphUpdater( + ingestor=self.ingestor, + repo_path=Path(self.project_root), + parsers=self.parsers, + queries=self.queries, + ) + updater.run() + + return cs.MCP_INDEX_SUCCESS_PROJECT.format( + path=self.project_root, project_name=project_name + ) + async def index_repository(self) -> str: logger.info(lg.MCP_INDEXING_REPO.format(path=self.project_root)) - project_name = Path(self.project_root).resolve().name try: - logger.info(lg.MCP_CLEARING_PROJECT.format(project_name=project_name)) - self.ingestor.delete_project(project_name) - - updater = GraphUpdater( - ingestor=self.ingestor, - repo_path=Path(self.project_root), - parsers=self.parsers, - queries=self.queries, - ) - updater.run() - - return cs.MCP_INDEX_SUCCESS_PROJECT.format( - path=self.project_root, project_name=project_name - ) + async with self._ingestor_lock: + return await asyncio.to_thread(self._index_repository_sync) except Exception as e: logger.error(lg.MCP_ERROR_INDEXING.format(error=e)) return cs.MCP_INDEX_ERROR.format(error=e) + def _update_repository_sync(self) -> str: + updater = GraphUpdater( + ingestor=self.ingestor, + repo_path=Path(self.project_root), + parsers=self.parsers, + queries=self.queries, + ) + updater.run() + return cs.MCP_UPDATE_SUCCESS.format(path=self.project_root) + + async def update_repository(self) -> str: + logger.info(lg.MCP_UPDATING_REPO.format(path=self.project_root)) + try: + async with self._ingestor_lock: + return await asyncio.to_thread(self._update_repository_sync) + except Exception as e: + logger.error(lg.MCP_ERROR_UPDATING.format(error=e)) + return cs.MCP_UPDATE_ERROR.format(error=e) + + async def semantic_search(self, natural_language_query: str, top_k: int = 5) -> str: + assert self._semantic_search_tool is not None + logger.info(lg.MCP_SEMANTIC_SEARCH.format(query=natural_language_query)) + result = await self._semantic_search_tool.function( + query=natural_language_query, top_k=top_k + ) + return str(result) + + async def ask_agent(self, question: str) -> dict[str, str]: + logger.info(lg.MCP_ASK_AGENT.format(question=question)) + try: + response = await self.rag_agent.run(question, message_history=[]) + return {"output": str(response.output)} + except Exception as e: + logger.error(lg.MCP_ASK_AGENT_ERROR.format(error=e)) + return {"error": cs.MCP_ASK_AGENT_ERROR.format(error=e)} + async def query_code_graph(self, natural_language_query: str) -> QueryResultDict: 
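[Editor's sketch] The registry methods above all follow one concurrency pattern: wrap each blocking ingestor operation in a sync helper, serialize helpers behind a shared asyncio.Lock, and run them via asyncio.to_thread so the event loop stays responsive. A minimal sketch of that pattern in isolation (SafeClient is hypothetical, not part of this codebase):

    import asyncio

    class SafeClient:
        # Wraps a blocking, non-thread-safe client for use from async code:
        # the lock prevents interleaved operations on the client, while
        # to_thread keeps the event loop free while a call runs.
        def __init__(self, client):
            self._client = client
            self._lock = asyncio.Lock()

        async def call(self, method_name: str, *args, **kwargs):
            async with self._lock:
                method = getattr(self._client, method_name)
                return await asyncio.to_thread(method, *args, **kwargs)

The lock matters because to_thread moves work onto a thread pool; without it, two awaited tool calls could hit the single Memgraph connection concurrently.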
logger.info(lg.MCP_QUERY_CODE_GRAPH.format(query=natural_language_query)) try: diff --git a/codebase_rag/parser_loader.py b/codebase_rag/parser_loader.py index 69ddabda3..e19205e6f 100644 --- a/codebase_rag/parser_loader.py +++ b/codebase_rag/parser_loader.py @@ -33,7 +33,7 @@ def _try_load_from_submodule(lang_name: cs.SupportedLanguage) -> LanguageLoader: setup_py_path = submodule_path / cs.SETUP_PY if setup_py_path.exists(): - logger.debug(ls.BUILDING_BINDINGS.format(lang=lang_name)) + logger.debug(ls.BUILDING_BINDINGS, lang=lang_name) result = subprocess.run( [sys.executable, cs.SETUP_PY, cs.BUILD_EXT_CMD, cs.INPLACE_FLAG], check=False, @@ -44,14 +44,15 @@ def _try_load_from_submodule(lang_name: cs.SupportedLanguage) -> LanguageLoader: if result.returncode != 0: logger.debug( - ls.BUILD_FAILED.format( - lang=lang_name, stdout=result.stdout, stderr=result.stderr - ) + ls.BUILD_FAILED, + lang=lang_name, + stdout=result.stdout, + stderr=result.stderr, ) return None - logger.debug(ls.BUILD_SUCCESS.format(lang=lang_name)) + logger.debug(ls.BUILD_SUCCESS, lang=lang_name) - logger.debug(ls.IMPORTING_MODULE.format(module=module_name)) + logger.debug(ls.IMPORTING_MODULE, module=module_name) module = importlib.import_module(module_name) language_attrs: list[str] = [ @@ -63,21 +64,19 @@ def _try_load_from_submodule(lang_name: cs.SupportedLanguage) -> LanguageLoader: for attr_name in language_attrs: if hasattr(module, attr_name): logger.debug( - ls.LOADED_FROM_SUBMODULE.format(lang=lang_name, attr=attr_name) + ls.LOADED_FROM_SUBMODULE, lang=lang_name, attr=attr_name ) loader: LanguageLoader = getattr(module, attr_name) return loader - logger.debug( - ls.NO_LANG_ATTR.format(module=module_name, available=dir(module)) - ) + logger.debug(ls.NO_LANG_ATTR, module=module_name, available=dir(module)) finally: if python_bindings_str in sys.path: sys.path.remove(python_bindings_str) except Exception as e: - logger.debug(ls.SUBMODULE_LOAD_FAILED.format(lang=lang_name, error=e)) + logger.debug(ls.SUBMODULE_LOAD_FAILED, lang=lang_name, error=e) return None @@ -137,6 +136,12 @@ def _import_language_loaders() -> dict[cs.SupportedLanguage, LanguageLoader]: cs.QUERY_LANGUAGE, cs.SupportedLanguage.JAVA, ), + LanguageImport( + cs.SupportedLanguage.C, + cs.TreeSitterModule.C, + cs.QUERY_LANGUAGE, + cs.SupportedLanguage.C, + ), LanguageImport( cs.SupportedLanguage.CPP, cs.TreeSitterModule.CPP, @@ -149,6 +154,12 @@ def _import_language_loaders() -> dict[cs.SupportedLanguage, LanguageLoader]: cs.QUERY_LANGUAGE, cs.SupportedLanguage.LUA, ), + LanguageImport( + cs.SupportedLanguage.PHP, + cs.TreeSitterModule.PHP, + cs.LANG_ATTR_PHP, + cs.SupportedLanguage.PHP, + ), ] loaders: dict[cs.SupportedLanguage, LanguageLoader] = { @@ -215,7 +226,7 @@ def _create_locals_query( try: return Query(language, locals_pattern) except Exception as e: - logger.debug(ls.LOCALS_QUERY_FAILED.format(lang=lang_name, error=e)) + logger.debug(ls.LOCALS_QUERY_FAILED, lang=lang_name, error=e) return None @@ -256,7 +267,7 @@ def _process_language( ) -> bool: lang_lib = LANGUAGE_LIBRARIES.get(lang_name) if not lang_lib: - logger.debug(ls.LIB_NOT_AVAILABLE.format(lang=lang_name)) + logger.debug(ls.LIB_NOT_AVAILABLE, lang=lang_name) return False try: diff --git a/codebase_rag/parsers/call_processor.py b/codebase_rag/parsers/call_processor.py index 0e53cbe73..9655b2fc4 100644 --- a/codebase_rag/parsers/call_processor.py +++ b/codebase_rag/parsers/call_processor.py @@ -14,10 +14,12 @@ from .cpp import utils as cpp_utils from .import_processor import 
ImportProcessor from .type_inference import TypeInferenceEngine -from .utils import get_function_captures, is_method_node +from .utils import get_function_captures, is_method_node, sorted_captures class CallProcessor: + __slots__ = ("ingestor", "repo_path", "project_name", "_resolver") + def __init__( self, ingestor: IngestorProtocol, @@ -54,7 +56,7 @@ def process_calls_in_file( queries: dict[cs.SupportedLanguage, LanguageQueries], ) -> None: relative_path = file_path.relative_to(self.repo_path) - logger.debug(ls.CALL_PROCESSING_FILE.format(path=relative_path)) + logger.debug(ls.CALL_PROCESSING_FILE, path=relative_path) try: module_qn = cs.SEPARATOR_DOT.join( @@ -70,7 +72,7 @@ def process_calls_in_file( self._process_module_level_calls(root_node, module_qn, language, queries) except Exception as e: - logger.error(ls.CALL_PROCESSING_FAILED.format(path=file_path, error=e)) + logger.error(ls.CALL_PROCESSING_FAILED, path=file_path, error=e) def _process_calls_in_functions( self, @@ -141,12 +143,15 @@ def _process_methods_in_class( if not method_query: return method_cursor = QueryCursor(method_query) - method_captures = method_cursor.captures(body_node) + method_captures = sorted_captures(method_cursor, body_node) method_nodes = method_captures.get(cs.CAPTURE_FUNCTION, []) for method_node in method_nodes: if not isinstance(method_node, Node): continue - method_name = self._get_node_name(method_node) + if language == cs.SupportedLanguage.CPP: + method_name = cpp_utils.extract_function_name(method_node) + else: + method_name = self._get_node_name(method_node) if not method_name: continue method_qn = f"{class_qn}{cs.SEPARATOR_DOT}{method_name}" @@ -171,7 +176,7 @@ def _process_calls_in_classes( if not query: return cursor = QueryCursor(query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) class_nodes = captures.get(cs.CAPTURE_CLASS, []) for class_node in class_nodes: @@ -270,13 +275,14 @@ def _ingest_function_calls( ) cursor = QueryCursor(calls_query) - captures = cursor.captures(caller_node) + captures = sorted_captures(cursor, caller_node) call_nodes = captures.get(cs.CAPTURE_CALL, []) logger.debug( - ls.CALL_FOUND_NODES.format( - count=len(call_nodes), language=language, caller=caller_qn - ) + ls.CALL_FOUND_NODES, + count=len(call_nodes), + language=language, + caller=caller_qn, ) for call_node in call_nodes: @@ -310,13 +316,21 @@ def _ingest_function_calls( callee_type, callee_qn = operator_info else: continue - logger.debug( - ls.CALL_FOUND.format( + if callee_type == cs.NodeLabel.CLASS: + logger.debug( + ls.CALL_SKIP_CLASS, caller=caller_qn, call_name=call_name, - callee_type=callee_type, callee_qn=callee_qn, ) + continue + + logger.debug( + ls.CALL_FOUND, + caller=caller_qn, + call_name=call_name, + callee_type=callee_type, + callee_qn=callee_qn, ) self.ingestor.ensure_relationship_batch( @@ -337,9 +351,7 @@ def _build_nested_qualified_name( if not isinstance(current, Node): logger.warning( - ls.CALL_UNEXPECTED_PARENT.format( - node=func_node, parent_type=type(current) - ) + ls.CALL_UNEXPECTED_PARENT, node=func_node, parent_type=type(current) ) return None diff --git a/codebase_rag/parsers/call_resolver.py b/codebase_rag/parsers/call_resolver.py index 322a583a3..286b3ac50 100644 --- a/codebase_rag/parsers/call_resolver.py +++ b/codebase_rag/parsers/call_resolver.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +from collections import deque from loguru import logger from tree_sitter import Node @@ -12,8 +13,18 @@ from .py import 
resolve_class_name from .type_inference import TypeInferenceEngine +_SEPARATOR_PATTERN = re.compile(r"[.:]|::") +_CHAINED_METHOD_PATTERN = re.compile(r"\.([^.()]+)$") + class CallResolver: + __slots__ = ( + "function_registry", + "import_processor", + "type_inference", + "class_inheritance", + ) + def __init__( self, function_registry: FunctionRegistryTrieProtocol, @@ -119,9 +130,7 @@ def _try_resolve_direct_import( return None imported_qn = import_map[call_name] if imported_qn in self.function_registry: - logger.debug( - ls.CALL_DIRECT_IMPORT.format(call_name=call_name, qn=imported_qn) - ) + logger.debug(ls.CALL_DIRECT_IMPORT, call_name=call_name, qn=imported_qn) return self.function_registry[imported_qn], imported_qn return None @@ -187,9 +196,7 @@ def _try_wildcard_qns( for wildcard_qn in potential_qns: if wildcard_qn in self.function_registry: - logger.debug( - ls.CALL_WILDCARD.format(call_name=call_name, qn=wildcard_qn) - ) + logger.debug(ls.CALL_WILDCARD, call_name=call_name, qn=wildcard_qn) return self.function_registry[wildcard_qn], wildcard_qn return None @@ -199,7 +206,7 @@ def _try_resolve_same_module( same_module_func_qn = f"{module_qn}.{call_name}" if same_module_func_qn in self.function_registry: logger.debug( - ls.CALL_SAME_MODULE.format(call_name=call_name, qn=same_module_func_qn) + ls.CALL_SAME_MODULE, call_name=call_name, qn=same_module_func_qn ) return self.function_registry[same_module_func_qn], same_module_func_qn return None @@ -207,19 +214,17 @@ def _try_resolve_same_module( def _try_resolve_via_trie( self, call_name: str, module_qn: str ) -> tuple[str, str] | None: - search_name = re.split(r"[.:]|::", call_name)[-1] + search_name = _SEPARATOR_PATTERN.split(call_name)[-1] possible_matches = self.function_registry.find_ending_with(search_name) if not possible_matches: - logger.debug(ls.CALL_UNRESOLVED.format(call_name=call_name)) + logger.debug(ls.CALL_UNRESOLVED, call_name=call_name) return None possible_matches.sort( - key=lambda qn: self._calculate_import_distance(qn, module_qn) + key=lambda qn: (self._calculate_import_distance(qn, module_qn), qn) ) best_candidate_qn = possible_matches[0] - logger.debug( - ls.CALL_TRIE_FALLBACK.format(call_name=call_name, qn=best_candidate_qn) - ) + logger.debug(ls.CALL_TRIE_FALLBACK, call_name=call_name, qn=best_candidate_qn) return self.function_registry[best_candidate_qn], best_candidate_qn def _resolve_two_part_call( @@ -293,23 +298,21 @@ def _try_method_on_class( method_qn = f"{class_qn}{separator}{method_name}" if method_qn in self.function_registry: logger.debug( - ls.CALL_TYPE_INFERRED.format( - call_name=call_name, - method_qn=method_qn, - obj=object_name, - var_type=var_type, - ) + ls.CALL_TYPE_INFERRED, + call_name=call_name, + method_qn=method_qn, + obj=object_name, + var_type=var_type, ) return self.function_registry[method_qn], method_qn if inherited := self._resolve_inherited_method(class_qn, method_name): logger.debug( - ls.CALL_TYPE_INFERRED_INHERITED.format( - call_name=call_name, - method_qn=inherited[1], - obj=object_name, - var_type=var_type, - ) + ls.CALL_TYPE_INFERRED_INHERITED, + call_name=call_name, + method_qn=inherited[1], + obj=object_name, + var_type=var_type, ) return inherited return None @@ -336,7 +339,7 @@ def _try_resolve_via_import( if method_qn in self.function_registry: logger.debug( - ls.CALL_IMPORT_STATIC.format(call_name=call_name, method_qn=method_qn) + ls.CALL_IMPORT_STATIC, call_name=call_name, method_qn=method_qn ) return self.function_registry[method_qn], method_qn return None @@ -377,7 
+380,7 @@ def _try_resolve_module_method( method_qn = f"{module_qn}.{method_name}" if method_qn in self.function_registry: logger.debug( - ls.CALL_OBJECT_METHOD.format(call_name=call_name, method_qn=method_qn) + ls.CALL_OBJECT_METHOD, call_name=call_name, method_qn=method_qn ) return self.function_registry[method_qn], method_qn return None @@ -401,12 +404,11 @@ def _resolve_self_attribute_call( method_qn = f"{class_qn}.{method_name}" if method_qn in self.function_registry: logger.debug( - ls.CALL_INSTANCE_ATTR.format( - call_name=call_name, - method_qn=method_qn, - attr_ref=attribute_ref, - var_type=var_type, - ) + ls.CALL_INSTANCE_ATTR, + call_name=call_name, + method_qn=method_qn, + attr_ref=attribute_ref, + var_type=var_type, ) return self.function_registry[method_qn], method_qn @@ -414,12 +416,11 @@ def _resolve_self_attribute_call( class_qn, method_name ): logger.debug( - ls.CALL_INSTANCE_ATTR_INHERITED.format( - call_name=call_name, - method_qn=inherited_method[1], - attr_ref=attribute_ref, - var_type=var_type, - ) + ls.CALL_INSTANCE_ATTR_INHERITED, + call_name=call_name, + method_qn=inherited_method[1], + attr_ref=attribute_ref, + var_type=var_type, ) return inherited_method @@ -441,9 +442,9 @@ def _resolve_multi_part_call( method_qn = f"{class_qn}.{method_name}" if method_qn in self.function_registry: logger.debug( - ls.CALL_IMPORT_QUALIFIED.format( - call_name=call_name, method_qn=method_qn - ) + ls.CALL_IMPORT_QUALIFIED, + call_name=call_name, + method_qn=method_qn, ) return self.function_registry[method_qn], method_qn @@ -455,12 +456,11 @@ def _resolve_multi_part_call( method_qn = f"{class_qn}.{method_name}" if method_qn in self.function_registry: logger.debug( - ls.CALL_INSTANCE_QUALIFIED.format( - call_name=call_name, - method_qn=method_qn, - class_name=class_name, - var_type=var_type, - ) + ls.CALL_INSTANCE_QUALIFIED, + call_name=call_name, + method_qn=method_qn, + class_name=class_name, + var_type=var_type, ) return self.function_registry[method_qn], method_qn @@ -468,12 +468,11 @@ def _resolve_multi_part_call( class_qn, method_name ): logger.debug( - ls.CALL_INSTANCE_INHERITED.format( - call_name=call_name, - method_qn=inherited_method[1], - class_name=class_name, - var_type=var_type, - ) + ls.CALL_INSTANCE_INHERITED, + call_name=call_name, + method_qn=inherited_method[1], + class_name=class_name, + var_type=var_type, ) return inherited_method @@ -536,7 +535,7 @@ def _resolve_chained_call( module_qn: str, local_var_types: dict[str, str] | None = None, ) -> tuple[str, str] | None: - match = re.search(r"\.([^.()]+)$", call_name) + match = _CHAINED_METHOD_PATTERN.search(call_name) if not match: return None @@ -559,12 +558,11 @@ def _resolve_chained_call( if method_qn in self.function_registry: logger.debug( - ls.CALL_CHAINED.format( - call_name=call_name, - method_qn=method_qn, - obj_expr=object_expr, - obj_type=object_type, - ) + ls.CALL_CHAINED, + call_name=call_name, + method_qn=method_qn, + obj_expr=object_expr, + obj_type=object_type, ) return self.function_registry[method_qn], method_qn @@ -572,12 +570,11 @@ def _resolve_chained_call( full_object_type, final_method ): logger.debug( - ls.CALL_CHAINED_INHERITED.format( - call_name=call_name, - method_qn=inherited_method[1], - obj_expr=object_expr, - obj_type=object_type, - ) + ls.CALL_CHAINED_INHERITED, + call_name=call_name, + method_qn=inherited_method[1], + obj_expr=object_expr, + obj_type=object_type, ) return inherited_method @@ -596,31 +593,31 @@ def _resolve_super_call( current_class_qn = class_context if not 
current_class_qn: - logger.debug(ls.CALL_SUPER_NO_CONTEXT.format(call_name=call_name)) + logger.debug(ls.CALL_SUPER_NO_CONTEXT, call_name=call_name) return None if current_class_qn not in self.class_inheritance: - logger.debug(ls.CALL_SUPER_NO_INHERITANCE.format(class_qn=current_class_qn)) + logger.debug(ls.CALL_SUPER_NO_INHERITANCE, class_qn=current_class_qn) return None parent_classes = self.class_inheritance[current_class_qn] if not parent_classes: - logger.debug(ls.CALL_SUPER_NO_PARENTS.format(class_qn=current_class_qn)) + logger.debug(ls.CALL_SUPER_NO_PARENTS, class_qn=current_class_qn) return None if result := self._resolve_inherited_method(current_class_qn, method_name): callee_type, parent_method_qn = result logger.debug( - ls.CALL_SUPER_RESOLVED.format( - call_name=call_name, method_qn=parent_method_qn - ) + ls.CALL_SUPER_RESOLVED, + call_name=call_name, + method_qn=parent_method_qn, ) return callee_type, parent_method_qn logger.debug( - ls.CALL_SUPER_UNRESOLVED.format( - call_name=call_name, class_qn=current_class_qn - ) + ls.CALL_SUPER_UNRESOLVED, + call_name=call_name, + class_qn=current_class_qn, ) return None @@ -630,11 +627,11 @@ def _resolve_inherited_method( if class_qn not in self.class_inheritance: return None - queue = list(self.class_inheritance.get(class_qn, [])) - visited = set(queue) + bfs_queue = deque(self.class_inheritance.get(class_qn, [])) + visited = set(bfs_queue) - while queue: - parent_class_qn = queue.pop(0) + while bfs_queue: + parent_class_qn = bfs_queue.popleft() parent_method_qn = f"{parent_class_qn}.{method_name}" if parent_method_qn in self.function_registry: @@ -647,7 +644,7 @@ def _resolve_inherited_method( for grandparent_qn in self.class_inheritance[parent_class_qn]: if grandparent_qn not in visited: visited.add(grandparent_qn) - queue.append(grandparent_qn) + bfs_queue.append(grandparent_qn) return None @@ -697,7 +694,7 @@ def resolve_java_method_call( else cs.TEXT_UNKNOWN ) logger.debug( - ls.CALL_JAVA_RESOLVED.format(call_text=call_text, method_qn=result[1]) + ls.CALL_JAVA_RESOLVED, call_text=call_text, method_qn=result[1] ) return result diff --git a/codebase_rag/parsers/class_ingest/cpp_modules.py b/codebase_rag/parsers/class_ingest/cpp_modules.py index a5db9bc47..7a7a42c60 100644 --- a/codebase_rag/parsers/class_ingest/cpp_modules.py +++ b/codebase_rag/parsers/class_ingest/cpp_modules.py @@ -84,6 +84,7 @@ def _process_export_module( cs.KEY_QUALIFIED_NAME: interface_qn, cs.KEY_NAME: module_name, cs.KEY_PATH: str(file_path.relative_to(repo_path)), + cs.KEY_ABSOLUTE_PATH: file_path.resolve().as_posix(), cs.KEY_MODULE_TYPE: cs.CPP_MODULE_TYPE_INTERFACE, }, ) @@ -118,6 +119,7 @@ def _process_module_implementation( cs.KEY_QUALIFIED_NAME: impl_qn, cs.KEY_NAME: f"{module_name}{cs.CPP_IMPL_SUFFIX}", cs.KEY_PATH: str(file_path.relative_to(repo_path)), + cs.KEY_ABSOLUTE_PATH: file_path.resolve().as_posix(), cs.KEY_IMPLEMENTS_MODULE: module_name, cs.KEY_MODULE_TYPE: cs.CPP_MODULE_TYPE_IMPLEMENTATION, }, diff --git a/codebase_rag/parsers/class_ingest/method_override.py b/codebase_rag/parsers/class_ingest/method_override.py index 686ff26e6..9dfc8bedf 100644 --- a/codebase_rag/parsers/class_ingest/method_override.py +++ b/codebase_rag/parsers/class_ingest/method_override.py @@ -66,9 +66,9 @@ def check_method_overrides( (cs.NodeLabel.METHOD, cs.KEY_QUALIFIED_NAME, parent_method_qn), ) logger.debug( - logs.CLASS_METHOD_OVERRIDE.format( - method_qn=method_qn, parent_method_qn=parent_method_qn - ) + logs.CLASS_METHOD_OVERRIDE, + method_qn=method_qn, + 
parent_method_qn=parent_method_qn, ) return diff --git a/codebase_rag/parsers/class_ingest/mixin.py b/codebase_rag/parsers/class_ingest/mixin.py index 2ba3f8f8c..44afc6280 100644 --- a/codebase_rag/parsers/class_ingest/mixin.py +++ b/codebase_rag/parsers/class_ingest/mixin.py @@ -9,11 +9,12 @@ from ... import constants as cs from ... import logs +from ...language_spec import LanguageSpec from ...types_defs import ASTNode, PropertyDict from ..java import utils as java_utils from ..py import resolve_class_name from ..rs import utils as rs_utils -from ..utils import ingest_method, safe_decode_text +from ..utils import ingest_method, safe_decode_text, sorted_captures from . import cpp_modules from . import identity as id_ from . import method_override as mo @@ -21,7 +22,6 @@ from . import relationships as rel if TYPE_CHECKING: - from ...language_spec import LanguageSpec from ...services import IngestorProtocol from ...types_defs import ( FunctionRegistryTrieProtocol, @@ -31,7 +31,22 @@ from ..import_processor import ImportProcessor +def _is_nested_inside_function( + node: Node, class_body: Node, lang_config: LanguageSpec +) -> bool: + current = node.parent + while current and current is not class_body: + if ( + current.type in lang_config.function_node_types + and current.child_by_field_name(cs.FIELD_BODY) is not None + ): + return True + current = current.parent + return False + + class ClassIngestMixin: + __slots__ = () ingestor: IngestorProtocol repo_path: Path project_name: str @@ -81,7 +96,7 @@ def _ingest_classes_and_methods( lang_config: LanguageSpec = lang_queries[cs.QUERY_CONFIG] cursor = QueryCursor(query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) class_nodes = captures.get(cs.CAPTURE_CLASS, []) module_nodes = captures.get(cs.ONEOF_MODULE, []) @@ -142,6 +157,9 @@ def _process_class_node( cs.KEY_DOCSTRING: self._get_docstring(class_node), cs.KEY_IS_EXPORTED: is_exported, } + if file_path is not None: + class_props[cs.KEY_PATH] = file_path.relative_to(self.repo_path).as_posix() + class_props[cs.KEY_ABSOLUTE_PATH] = file_path.resolve().as_posix() self.ingestor.ensure_node_batch(node_type, class_props) self.function_registry[class_qn] = node_type if class_name: @@ -160,7 +178,9 @@ def _process_class_node( self._resolve_to_qn, self.function_registry, ) - self._ingest_class_methods(class_node, class_qn, language, lang_queries) + self._ingest_class_methods( + class_node, class_qn, language, lang_queries, file_path + ) def _ingest_rust_impl_methods( self, @@ -179,20 +199,27 @@ def _ingest_rust_impl_methods( if not body_node or not method_query: return + file_path = self.module_qn_to_file_path.get(module_qn) + lang_config: LanguageSpec = lang_queries[cs.QUERY_CONFIG] method_cursor = QueryCursor(method_query) - method_captures = method_cursor.captures(body_node) + method_captures = sorted_captures(method_cursor, body_node) for method_node in method_captures.get(cs.CAPTURE_FUNCTION, []): - if isinstance(method_node, Node): - ingest_method( - method_node, - class_qn, - cs.NodeLabel.CLASS, - self.ingestor, - self.function_registry, - self.simple_name_lookup, - self._get_docstring, - language, - ) + if not isinstance(method_node, Node): + continue + if _is_nested_inside_function(method_node, body_node, lang_config): + continue + ingest_method( + method_node, + class_qn, + cs.NodeLabel.CLASS, + self.ingestor, + self.function_registry, + self.simple_name_lookup, + self._get_docstring, + language, + file_path=file_path, + repo_path=self.repo_path, + ) def 
_ingest_class_methods( self, @@ -200,17 +227,21 @@ def _ingest_class_methods( class_qn: str, language: cs.SupportedLanguage, lang_queries: LanguageQueries, + file_path: Path | None = None, ) -> None: body_node = class_node.child_by_field_name("body") method_query = lang_queries[cs.QUERY_FUNCTIONS] if not body_node or not method_query: return + lang_config: LanguageSpec = lang_queries[cs.QUERY_CONFIG] method_cursor = QueryCursor(method_query) - method_captures = method_cursor.captures(body_node) + method_captures = sorted_captures(method_cursor, body_node) for method_node in method_captures.get(cs.CAPTURE_FUNCTION, []): if not isinstance(method_node, Node): continue + if _is_nested_inside_function(method_node, body_node, lang_config): + continue method_qualified_name = None if language == cs.SupportedLanguage.JAVA: @@ -233,6 +264,8 @@ def _ingest_class_methods( language, self._extract_decorators, method_qualified_name, + file_path=file_path, + repo_path=self.repo_path, ) def _process_inline_modules( diff --git a/codebase_rag/parsers/class_ingest/node_type.py b/codebase_rag/parsers/class_ingest/node_type.py index 8cdf66d78..95c6237ea 100644 --- a/codebase_rag/parsers/class_ingest/node_type.py +++ b/codebase_rag/parsers/class_ingest/node_type.py @@ -16,19 +16,24 @@ def determine_node_type( language: cs.SupportedLanguage, ) -> NodeType: match class_node.type: - case cs.TS_INTERFACE_DECLARATION: + case cs.TS_INTERFACE_DECLARATION | cs.TS_RS_TRAIT_ITEM: logger.info(logs.CLASS_FOUND_INTERFACE.format(name=class_name, qn=class_qn)) return NodeType.INTERFACE - case cs.TS_ENUM_DECLARATION | cs.TS_ENUM_SPECIFIER | cs.TS_ENUM_CLASS_SPECIFIER: + case ( + cs.TS_ENUM_DECLARATION + | cs.TS_ENUM_SPECIFIER + | cs.TS_ENUM_CLASS_SPECIFIER + | cs.TS_RS_ENUM_ITEM + ): logger.info(logs.CLASS_FOUND_ENUM.format(name=class_name, qn=class_qn)) return NodeType.ENUM - case cs.TS_TYPE_ALIAS_DECLARATION: + case cs.TS_TYPE_ALIAS_DECLARATION | cs.TS_RS_TYPE_ITEM: logger.info(logs.CLASS_FOUND_TYPE.format(name=class_name, qn=class_qn)) return NodeType.TYPE - case cs.TS_STRUCT_SPECIFIER: + case cs.TS_STRUCT_SPECIFIER | cs.TS_RS_STRUCT_ITEM: logger.info(logs.CLASS_FOUND_STRUCT.format(name=class_name, qn=class_qn)) return NodeType.CLASS - case cs.TS_UNION_SPECIFIER: + case cs.TS_UNION_SPECIFIER | cs.TS_RS_UNION_ITEM: logger.info(logs.CLASS_FOUND_UNION.format(name=class_name, qn=class_qn)) return NodeType.UNION case cs.CppNodeType.TEMPLATE_DECLARATION: diff --git a/codebase_rag/parsers/class_ingest/parent_extraction.py b/codebase_rag/parsers/class_ingest/parent_extraction.py index 289e82c35..ac0085724 100644 --- a/codebase_rag/parsers/class_ingest/parent_extraction.py +++ b/codebase_rag/parsers/class_ingest/parent_extraction.py @@ -90,9 +90,9 @@ def parse_cpp_base_classes( ) parent_classes.append(parent_qn) logger.debug( - logs.CLASS_CPP_INHERITANCE.format( - parent_name=parent_name, parent_qn=parent_qn - ) + logs.CLASS_CPP_INHERITANCE, + parent_name=parent_name, + parent_qn=parent_qn, ) return parent_classes diff --git a/codebase_rag/parsers/definition_processor.py b/codebase_rag/parsers/definition_processor.py index 8110140f8..fb549e3ed 100644 --- a/codebase_rag/parsers/definition_processor.py +++ b/codebase_rag/parsers/definition_processor.py @@ -48,6 +48,7 @@ def __init__( self.import_processor = import_processor self.module_qn_to_file_path = module_qn_to_file_path self.class_inheritance: dict[str, list[str]] = {} + self._deferred_cpp_methods: list = [] self._handler = get_handler(cs.SupportedLanguage.PYTHON) def 
process_file( @@ -100,6 +101,7 @@ def process_file( cs.KEY_QUALIFIED_NAME: module_qn, cs.KEY_NAME: file_path.name, cs.KEY_PATH: relative_path_str, + cs.KEY_ABSOLUTE_PATH: file_path.resolve().as_posix(), }, ) diff --git a/codebase_rag/parsers/dependency_parser.py b/codebase_rag/parsers/dependency_parser.py index 61f7d4b92..94a66ad87 100644 --- a/codebase_rag/parsers/dependency_parser.py +++ b/codebase_rag/parsers/dependency_parser.py @@ -26,11 +26,15 @@ def _extract_pep508_package_name(dep_string: str) -> tuple[str, str]: class DependencyParser: + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: raise NotImplementedError class PyProjectTomlParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -72,6 +76,8 @@ def parse(self, file_path: Path) -> list[Dependency]: class RequirementsTxtParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -92,6 +98,8 @@ def parse(self, file_path: Path) -> list[Dependency]: class PackageJsonParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -120,6 +128,8 @@ def _load_and_collect_deps( class CargoTomlParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -148,6 +158,8 @@ def parse(self, file_path: Path) -> list[Dependency]: class GoModParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -186,6 +198,8 @@ def parse(self, file_path: Path) -> list[Dependency]: class GemfileParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -206,6 +220,8 @@ def parse(self, file_path: Path) -> list[Dependency]: class ComposerJsonParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: @@ -229,6 +245,8 @@ def parse(self, file_path: Path) -> list[Dependency]: class CsprojParser(DependencyParser): + __slots__ = () + def parse(self, file_path: Path) -> list[Dependency]: dependencies: list[Dependency] = [] try: diff --git a/codebase_rag/parsers/factory.py b/codebase_rag/parsers/factory.py index a6b8a244c..3584ab325 100644 --- a/codebase_rag/parsers/factory.py +++ b/codebase_rag/parsers/factory.py @@ -16,6 +16,24 @@ class ProcessorFactory: + __slots__ = ( + "ingestor", + "repo_path", + "project_name", + "queries", + "function_registry", + "simple_name_lookup", + "ast_cache", + "unignore_paths", + "exclude_paths", + "module_qn_to_file_path", + "_import_processor", + "_structure_processor", + "_definition_processor", + "_type_inference", + "_call_processor", + ) + def __init__( self, ingestor: IngestorProtocol, diff --git a/codebase_rag/parsers/function_ingest.py b/codebase_rag/parsers/function_ingest.py index 1d32186e0..438fdc1af 100644 --- a/codebase_rag/parsers/function_ingest.py +++ b/codebase_rag/parsers/function_ingest.py @@ -40,7 +40,17 @@ class FunctionResolution(NamedTuple): is_exported: bool +class _DeferredMethod(NamedTuple): + """Out-of-class C++ method whose class hasn't been parsed yet.""" + + method_name: str + class_name: str + fallback_class_qn: str + method_props: PropertyDict + + class FunctionIngestMixin: + __slots__ = () ingestor: 
IngestorProtocol repo_path: Path project_name: str @@ -48,6 +58,7 @@ class FunctionIngestMixin: simple_name_lookup: SimpleNameLookup module_qn_to_file_path: dict[str, Path] _handler: LanguageHandler + _deferred_cpp_methods: list[_DeferredMethod] @abstractmethod def _get_docstring(self, node: ASTNode) -> str | None: ... @@ -147,6 +158,29 @@ def _fallback_function_resolution( func_node, module_qn, language, lang_config ) + def _resolve_cpp_class_qn( + self, class_name: str, module_qn: str + ) -> tuple[str, bool]: + """Look up an existing Class node for *class_name* across all parsed files. + + Returns ``(class_qn, resolved)`` where *resolved* is True when the + qualified name was obtained from the function registry (i.e. the + class has already been parsed, typically from a header file). + """ + class_name_normalized = class_name.replace( + cs.SEPARATOR_DOUBLE_COLON, cs.SEPARATOR_DOT + ) + leaf_name = class_name_normalized.rsplit(cs.SEPARATOR_DOT, 1)[-1] + + if leaf_name in self.simple_name_lookup: + for candidate_qn in self.simple_name_lookup[leaf_name]: + node_type = self.function_registry.get(candidate_qn) + if node_type in {NodeType.CLASS, NodeType.TYPE}: + if candidate_qn.endswith(f".{class_name_normalized}"): + return candidate_qn, True + + return f"{module_qn}.{class_name_normalized}", False + def _handle_cpp_out_of_class_method(self, func_node: Node, module_qn: str) -> bool: if not cpp_utils.is_out_of_class_method_definition(func_node): return False @@ -155,25 +189,86 @@ def _handle_cpp_out_of_class_method(self, func_node: Node, module_qn: str) -> bo if not class_name: return False - class_name_normalized = class_name.replace( - cs.SEPARATOR_DOUBLE_COLON, cs.SEPARATOR_DOT - ) - class_qn = f"{module_qn}.{class_name_normalized}" - - ingest_method( - method_node=func_node, - container_qn=class_qn, - container_type=cs.NodeLabel.CLASS, - ingestor=self.ingestor, - function_registry=self.function_registry, - simple_name_lookup=self.simple_name_lookup, - get_docstring_func=self._get_docstring, - language=cs.SupportedLanguage.CPP, - extract_decorators_func=self._extract_decorators, - ) + class_qn, resolved = self._resolve_cpp_class_qn(class_name, module_qn) + file_path = self.module_qn_to_file_path.get(module_qn) + + if resolved: + ingest_method( + method_node=func_node, + container_qn=class_qn, + container_type=cs.NodeLabel.CLASS, + ingestor=self.ingestor, + function_registry=self.function_registry, + simple_name_lookup=self.simple_name_lookup, + get_docstring_func=self._get_docstring, + language=cs.SupportedLanguage.CPP, + extract_decorators_func=self._extract_decorators, + file_path=file_path, + repo_path=self.repo_path, + ) + else: + method_name = cpp_utils.extract_function_name(func_node) + if not method_name: + return True + decorators = self._extract_decorators(func_node) + props: PropertyDict = { + cs.KEY_NAME: method_name, + cs.KEY_DECORATORS: decorators, + cs.KEY_START_LINE: func_node.start_point[0] + 1, + cs.KEY_END_LINE: func_node.end_point[0] + 1, + cs.KEY_DOCSTRING: self._get_docstring(func_node), + } + if file_path is not None and self.repo_path is not None: + props[cs.KEY_PATH] = file_path.relative_to(self.repo_path).as_posix() + props[cs.KEY_ABSOLUTE_PATH] = file_path.resolve().as_posix() + if not hasattr(self, "_deferred_cpp_methods"): + self._deferred_cpp_methods = [] + self._deferred_cpp_methods.append( + _DeferredMethod( + method_name=method_name, + class_name=class_name, + fallback_class_qn=class_qn, + method_props=props, + ) + ) return True + def 
resolve_deferred_cpp_methods(self) -> int: + """Ingest deferred out-of-class C++ methods now that all classes are known. + + Called after all files have been parsed so that every Class node + is guaranteed to be in the registry. Returns the number of + methods that were ingested. + """ + deferred = getattr(self, "_deferred_cpp_methods", None) + if not deferred: + return 0 + + ingested = 0 + for entry in deferred: + real_class_qn, resolved = self._resolve_cpp_class_qn(entry.class_name, "") + class_qn = real_class_qn if resolved else entry.fallback_class_qn + method_qn = f"{class_qn}.{entry.method_name}" + + props = dict(entry.method_props) + props[cs.KEY_QUALIFIED_NAME] = method_qn + + logger.info(ls.METHOD_FOUND.format(name=entry.method_name, qn=method_qn)) + self.ingestor.ensure_node_batch(cs.NodeLabel.METHOD, props) + self.function_registry[method_qn] = NodeType.METHOD + self.simple_name_lookup[entry.method_name].add(method_qn) + + self.ingestor.ensure_relationship_batch( + (cs.NodeLabel.CLASS, cs.KEY_QUALIFIED_NAME, class_qn), + cs.RelationshipType.DEFINES_METHOD, + (cs.NodeLabel.METHOD, cs.KEY_QUALIFIED_NAME, method_qn), + ) + ingested += 1 + + self._deferred_cpp_methods = [] + return ingested + def _resolve_cpp_function( self, func_node: Node, module_qn: str ) -> FunctionResolution | None: @@ -238,7 +333,7 @@ def _register_function( language: cs.SupportedLanguage, lang_config: LanguageSpec, ) -> None: - func_props = self._build_function_props(func_node, resolution) + func_props = self._build_function_props(func_node, resolution, module_qn) logger.info( ls.FUNC_FOUND.format(name=resolution.name, qn=resolution.qualified_name) ) @@ -253,9 +348,10 @@ def _register_function( ) def _build_function_props( - self, func_node: Node, resolution: FunctionResolution + self, func_node: Node, resolution: FunctionResolution, module_qn: str ) -> PropertyDict: - return { + file_path = self.module_qn_to_file_path.get(module_qn) + props: PropertyDict = { cs.KEY_QUALIFIED_NAME: resolution.qualified_name, cs.KEY_NAME: resolution.name, cs.KEY_DECORATORS: self._extract_decorators(func_node), @@ -264,6 +360,10 @@ def _build_function_props( cs.KEY_DOCSTRING: self._get_docstring(func_node), cs.KEY_IS_EXPORTED: resolution.is_exported, } + if file_path is not None: + props[cs.KEY_PATH] = file_path.relative_to(self.repo_path).as_posix() + props[cs.KEY_ABSOLUTE_PATH] = file_path.resolve().as_posix() + return props def _create_function_relationships( self, diff --git a/codebase_rag/parsers/handlers/base.py b/codebase_rag/parsers/handlers/base.py index 14fa8cec9..7f264c1e1 100644 --- a/codebase_rag/parsers/handlers/base.py +++ b/codebase_rag/parsers/handlers/base.py @@ -13,6 +13,8 @@ class BaseLanguageHandler: + __slots__ = () + def is_inside_method_with_object_literals(self, node: ASTNode) -> bool: return False diff --git a/codebase_rag/parsers/handlers/cpp.py b/codebase_rag/parsers/handlers/cpp.py index d7c9dea04..854bcc4ac 100644 --- a/codebase_rag/parsers/handlers/cpp.py +++ b/codebase_rag/parsers/handlers/cpp.py @@ -17,6 +17,8 @@ class CppHandler(BaseLanguageHandler): + __slots__ = () + def extract_function_name(self, node: ASTNode) -> str | None: if func_name := cpp_utils.extract_function_name(node): return func_name diff --git a/codebase_rag/parsers/handlers/java.py b/codebase_rag/parsers/handlers/java.py index 4bd576beb..882fae0da 100644 --- a/codebase_rag/parsers/handlers/java.py +++ b/codebase_rag/parsers/handlers/java.py @@ -11,6 +11,8 @@ class JavaHandler(BaseLanguageHandler): + __slots__ = () + def 
extract_decorators(self, node: ASTNode) -> list[str]: return java_utils.extract_from_modifiers_node(node, frozenset()).annotations diff --git a/codebase_rag/parsers/handlers/js_ts.py b/codebase_rag/parsers/handlers/js_ts.py index 7a2ed6684..75c561209 100644 --- a/codebase_rag/parsers/handlers/js_ts.py +++ b/codebase_rag/parsers/handlers/js_ts.py @@ -12,6 +12,8 @@ class JsTsHandler(BaseLanguageHandler): + __slots__ = () + def extract_decorators(self, node: ASTNode) -> list[str]: return [ decorator_text diff --git a/codebase_rag/parsers/handlers/lua.py b/codebase_rag/parsers/handlers/lua.py index 9db185904..6b2d6177f 100644 --- a/codebase_rag/parsers/handlers/lua.py +++ b/codebase_rag/parsers/handlers/lua.py @@ -11,6 +11,8 @@ class LuaHandler(BaseLanguageHandler): + __slots__ = () + def extract_function_name(self, node: ASTNode) -> str | None: if (name_node := node.child_by_field_name(cs.TS_FIELD_NAME)) and name_node.text: from ..utils import safe_decode_text diff --git a/codebase_rag/parsers/handlers/php.py b/codebase_rag/parsers/handlers/php.py new file mode 100644 index 000000000..e529ab7dd --- /dev/null +++ b/codebase_rag/parsers/handlers/php.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from ... import constants as cs +from ..utils import safe_decode_text +from .base import BaseLanguageHandler + +if TYPE_CHECKING: + from ...types_defs import ASTNode + + +class PhpHandler(BaseLanguageHandler): + __slots__ = () + + _CLASS_LIKE_TYPES = frozenset( + { + cs.TS_CLASS_DECLARATION, + cs.TS_INTERFACE_DECLARATION, + cs.TS_PHP_TRAIT_DECLARATION, + cs.TS_ENUM_DECLARATION, + } + ) + + def is_class_method(self, node: ASTNode) -> bool: + parent = node.parent + while parent: + if parent.type in self._CLASS_LIKE_TYPES: + return True + parent = parent.parent + return False + + def extract_function_name(self, node: ASTNode) -> str | None: + if node.type == cs.TS_PHP_ANONYMOUS_FUNCTION: + return f"anonymous_{node.start_point[0]}_{node.start_point[1]}" + if node.type == cs.TS_PHP_ARROW_FUNCTION: + return f"arrow_{node.start_point[0]}_{node.start_point[1]}" + name_node = node.child_by_field_name(cs.TS_FIELD_NAME) + if name_node and name_node.text: + return safe_decode_text(name_node) + return None + + def is_function_exported(self, node: ASTNode) -> bool: + if node.type != cs.TS_PHP_METHOD_DECLARATION: + return True + for child in node.children: + if child.type == cs.TS_PHP_VISIBILITY_MODIFIER: + text = safe_decode_text(child) + return text == "public" + return True + + def extract_decorators(self, node: ASTNode) -> list[str]: + decorators: list[str] = [] + for child in node.children: + if child.type == cs.TS_PHP_ATTRIBUTE_LIST: + for group in child.children: + if group.type == cs.TS_PHP_ATTRIBUTE_GROUP: + for attr in group.children: + if attr.type == cs.TS_PHP_ATTRIBUTE: + if text := safe_decode_text(attr): + decorators.append(text) + return decorators diff --git a/codebase_rag/parsers/handlers/protocol.py b/codebase_rag/parsers/handlers/protocol.py index 9bdbe72b6..893888d78 100644 --- a/codebase_rag/parsers/handlers/protocol.py +++ b/codebase_rag/parsers/handlers/protocol.py @@ -10,6 +10,8 @@ class LanguageHandler(Protocol): + __slots__ = () + def is_inside_method_with_object_literals(self, node: ASTNode) -> bool: ... def is_class_method(self, node: ASTNode) -> bool: ... 
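
Note on the out-of-class C++ method handling added in codebase_rag/parsers/function_ingest.py above: it is a two-phase pattern. A definition such as `void Foo::bar() { ... }` encountered before `class Foo` has been parsed (e.g. the header file is visited later in the walk) is parked in `_deferred_cpp_methods`, and `resolve_deferred_cpp_methods()` replays the queue once every Class node is in the registry. A minimal, self-contained sketch of that pattern follows; the class and dict names here are illustrative stand-ins, not the real registry types.

from typing import NamedTuple

class DeferredMethod(NamedTuple):
    method_name: str
    class_name: str
    fallback_class_qn: str

class TwoPhaseIngest:
    def __init__(self) -> None:
        self.classes: dict[str, str] = {}  # leaf class name -> qualified name
        self.methods: dict[str, str] = {}  # method qn -> owning class qn
        self.deferred: list[DeferredMethod] = []

    def see_class(self, class_qn: str) -> None:
        self.classes[class_qn.rsplit(".", 1)[-1]] = class_qn

    def see_out_of_class_method(
        self, class_name: str, method_name: str, module_qn: str
    ) -> None:
        if class_name in self.classes:  # class already parsed: ingest immediately
            class_qn = self.classes[class_name]
            self.methods[f"{class_qn}.{method_name}"] = class_qn
        else:  # class unknown so far: defer, remembering a same-module fallback
            self.deferred.append(
                DeferredMethod(method_name, class_name, f"{module_qn}.{class_name}")
            )

    def resolve_deferred(self) -> int:
        ingested = 0
        for entry in self.deferred:
            class_qn = self.classes.get(entry.class_name, entry.fallback_class_qn)
            self.methods[f"{class_qn}.{entry.method_name}"] = class_qn
            ingested += 1
        self.deferred.clear()
        return ingested

ingest = TwoPhaseIngest()
# foo.cpp is parsed first, so Foo::bar() arrives before class Foo is known.
ingest.see_out_of_class_method("Foo", "bar", "proj.src.foo")
ingest.see_class("proj.include.foo.Foo")  # header parsed later
assert ingest.resolve_deferred() == 1
assert "proj.include.foo.Foo.bar" in ingest.methods

The real `_resolve_cpp_class_qn` additionally filters candidates by node type (CLASS/TYPE) and requires a qualified-name suffix match, so a `ns::Foo` definition does not bind to an unrelated `Foo` elsewhere in the repository.
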
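Note on the new PhpHandler above: anonymous and arrow functions receive deterministic, position-based names (`anonymous_<row>_<col>` / `arrow_<row>_<col>`), and a method counts as exported only when its visibility modifier is `public` or absent. A small sketch of the naming rule, using a hypothetical stand-in for a tree-sitter node (the literal node-type strings below are illustrative; the real code compares against the `cs.TS_PHP_*` constants):

from dataclasses import dataclass

@dataclass
class StubNode:  # hypothetical stand-in for tree_sitter.Node
    type: str
    start_point: tuple[int, int]

def php_callable_name(node: StubNode) -> str:
    # Position-based names stay unique and stable across re-parses of a file,
    # which keeps graph node identities deterministic for unnamed callables.
    if node.type == "anonymous_function":
        return f"anonymous_{node.start_point[0]}_{node.start_point[1]}"
    if node.type == "arrow_function":
        return f"arrow_{node.start_point[0]}_{node.start_point[1]}"
    raise ValueError(node.type)

assert php_callable_name(StubNode("arrow_function", (7, 14))) == "arrow_7_14"
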
diff --git a/codebase_rag/parsers/handlers/python.py b/codebase_rag/parsers/handlers/python.py index ae96501a5..1c424fdd8 100644 --- a/codebase_rag/parsers/handlers/python.py +++ b/codebase_rag/parsers/handlers/python.py @@ -11,6 +11,8 @@ class PythonHandler(BaseLanguageHandler): + __slots__ = () + def extract_decorators(self, node: ASTNode) -> list[str]: if not node.parent or node.parent.type != cs.TS_PY_DECORATED_DEFINITION: return [] diff --git a/codebase_rag/parsers/handlers/registry.py b/codebase_rag/parsers/handlers/registry.py index a886d7f9e..6f490700e 100644 --- a/codebase_rag/parsers/handlers/registry.py +++ b/codebase_rag/parsers/handlers/registry.py @@ -8,6 +8,7 @@ from .java import JavaHandler from .js_ts import JsTsHandler from .lua import LuaHandler +from .php import PhpHandler from .protocol import LanguageHandler from .python import PythonHandler from .rust import RustHandler @@ -20,6 +21,7 @@ SupportedLanguage.RUST: RustHandler, SupportedLanguage.JAVA: JavaHandler, SupportedLanguage.LUA: LuaHandler, + SupportedLanguage.PHP: PhpHandler, } _DEFAULT_HANDLER = BaseLanguageHandler diff --git a/codebase_rag/parsers/handlers/rust.py b/codebase_rag/parsers/handlers/rust.py index 650bec974..7ca6b2f5c 100644 --- a/codebase_rag/parsers/handlers/rust.py +++ b/codebase_rag/parsers/handlers/rust.py @@ -17,6 +17,8 @@ class RustHandler(BaseLanguageHandler): + __slots__ = () + def extract_decorators(self, node: ASTNode) -> list[str]: outer_decorators: list[str] = [] sibling = node.prev_named_sibling diff --git a/codebase_rag/parsers/import_processor.py b/codebase_rag/parsers/import_processor.py index 99c3a8526..317ad2114 100644 --- a/codebase_rag/parsers/import_processor.py +++ b/codebase_rag/parsers/import_processor.py @@ -1,3 +1,4 @@ +from functools import lru_cache from pathlib import Path from loguru import logger @@ -19,10 +20,26 @@ load_persistent_cache, save_persistent_cache, ) -from .utils import get_query_cursor, safe_decode_text, safe_decode_with_fallback +from .utils import ( + get_query_cursor, + safe_decode_text, + safe_decode_with_fallback, + sorted_captures, +) class ImportProcessor: + __slots__ = ( + "repo_path", + "project_name", + "ingestor", + "function_registry", + "import_mapping", + "stdlib_extractor", + "_is_local_module_cached", + "_is_local_java_import_cached", + ) + def __init__( self, repo_path: Path, @@ -39,6 +56,22 @@ def __init__( function_registry, repo_path, project_name ) + @lru_cache(maxsize=4096) + def _is_local_module_cached(module_name: str) -> bool: + return ( + (repo_path / module_name).is_dir() + or (repo_path / f"{module_name}{cs.EXT_PY}").is_file() + or (repo_path / module_name / cs.INIT_PY).is_file() + ) + + @lru_cache(maxsize=4096) + def _is_local_java_import_cached(import_path: str) -> bool: + top_level = import_path.split(cs.SEPARATOR_DOT)[0] + return (repo_path / top_level).is_dir() + + self._is_local_module_cached = _is_local_module_cached + self._is_local_java_import_cached = _is_local_java_import_cached + load_persistent_cache() def __del__(self) -> None: @@ -78,7 +111,7 @@ def parse_imports( try: cursor = get_query_cursor(imports_query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) match language: case cs.SupportedLanguage.PYTHON: @@ -95,13 +128,15 @@ def parse_imports( self._parse_cpp_imports(captures, module_qn) case cs.SupportedLanguage.LUA: self._parse_lua_imports(captures, module_qn) + case cs.SupportedLanguage.PHP: + self._parse_php_imports(captures, module_qn) case _: 
self._parse_generic_imports(captures, module_qn, lang_config) logger.debug( - ls.IMP_PARSED_COUNT.format( - count=len(self.import_mapping[module_qn]), module=module_qn - ) + ls.IMP_PARSED_COUNT, + count=len(self.import_mapping[module_qn]), + module=module_qn, ) if self.ingestor: @@ -124,15 +159,14 @@ def parse_imports( ), ) logger.debug( - ls.IMP_CREATED_RELATIONSHIP.format( - from_module=module_qn, - to_module=module_path, - full_name=full_name, - ) + ls.IMP_CREATED_RELATIONSHIP, + from_module=module_qn, + to_module=module_path, + full_name=full_name, ) except Exception as e: - logger.warning(ls.IMP_PARSE_FAILED.format(module=module_qn, error=e)) + logger.warning(ls.IMP_PARSE_FAILED, module=module_qn, error=e) def _parse_python_imports(self, captures: dict, module_qn: str) -> None: all_imports = captures.get(cs.CAPTURE_IMPORT, []) + captures.get( @@ -159,7 +193,7 @@ def _handle_dotted_name_import(self, child: Node, module_qn: str) -> None: local_name = module_name.split(cs.SEPARATOR_DOT)[0] full_name = self._resolve_import_full_name(module_name, local_name) self.import_mapping[module_qn][local_name] = full_name - logger.debug(ls.IMP_IMPORT.format(local=local_name, full=full_name)) + logger.debug(ls.IMP_IMPORT, local=local_name, full=full_name) def _handle_aliased_import(self, child: Node, module_qn: str) -> None: module_name_node = child.child_by_field_name(cs.FIELD_NAME) @@ -175,7 +209,7 @@ def _handle_aliased_import(self, child: Node, module_qn: str) -> None: top_level = module_name.split(cs.SEPARATOR_DOT)[0] full_name = self._resolve_import_full_name(module_name, top_level) self.import_mapping[module_qn][alias] = full_name - logger.debug(ls.IMP_ALIASED_IMPORT.format(alias=alias, full=full_name)) + logger.debug(ls.IMP_ALIASED_IMPORT, alias=alias, full=full_name) def _resolve_import_full_name(self, module_name: str, top_level: str) -> str: if self._is_local_module(top_level): @@ -183,15 +217,10 @@ def _resolve_import_full_name(self, module_name: str, top_level: str) -> str: return module_name def _is_local_module(self, module_name: str) -> bool: - return ( - (self.repo_path / module_name).is_dir() - or (self.repo_path / f"{module_name}{cs.EXT_PY}").is_file() - or (self.repo_path / module_name / cs.INIT_PY).is_file() - ) + return self._is_local_module_cached(module_name) def _is_local_java_import(self, import_path: str) -> bool: - top_level = import_path.split(cs.SEPARATOR_DOT)[0] - return (self.repo_path / top_level).is_dir() + return self._is_local_java_import_cached(import_path) def _resolve_java_import_path(self, import_path: str) -> str: if self._is_local_java_import(import_path): @@ -364,13 +393,13 @@ def _register_python_from_imports( if is_wildcard: wildcard_key = f"*{base_module}" self.import_mapping[module_qn][wildcard_key] = base_module - logger.debug(ls.IMP_WILDCARD_IMPORT.format(module=base_module)) + logger.debug(ls.IMP_WILDCARD_IMPORT, module=base_module) return for local_name, original_name in imported_items: full_name = f"{base_module}{cs.SEPARATOR_DOT}{original_name}" self.import_mapping[module_qn][local_name] = full_name - logger.debug(ls.IMP_FROM_IMPORT.format(local=local_name, full=full_name)) + logger.debug(ls.IMP_FROM_IMPORT, local=local_name, full=full_name) def _resolve_relative_import(self, relative_node: Node, module_qn: str) -> str: module_parts = module_qn.split(cs.SEPARATOR_DOT)[1:] @@ -446,7 +475,7 @@ def _parse_js_import_clause( f"{source_module}{cs.IMPORT_DEFAULT_SUFFIX}" ) logger.debug( - ls.IMP_JS_DEFAULT.format(name=imported_name, module=source_module) 
+ ls.IMP_JS_DEFAULT, name=imported_name, module=source_module ) elif child.type == cs.TS_NAMED_IMPORTS: @@ -465,11 +494,10 @@ def _parse_js_import_clause( f"{source_module}{cs.SEPARATOR_DOT}{imported_name}" ) logger.debug( - ls.IMP_JS_NAMED.format( - local=local_name, - module=source_module, - name=imported_name, - ) + ls.IMP_JS_NAMED, + local=local_name, + module=source_module, + name=imported_name, ) elif child.type == cs.TS_NAMESPACE_IMPORT: @@ -480,9 +508,9 @@ def _parse_js_import_clause( source_module ) logger.debug( - ls.IMP_JS_NAMESPACE.format( - name=namespace_name, module=source_module - ) + ls.IMP_JS_NAMESPACE, + name=namespace_name, + module=source_module, ) break @@ -521,9 +549,9 @@ def _parse_js_require(self, decl_node: Node, current_module: str) -> None: resolved_module ) logger.debug( - ls.IMP_JS_REQUIRE.format( - var=var_name, module=resolved_module - ) + ls.IMP_JS_REQUIRE, + var=var_name, + module=resolved_module, ) break @@ -544,7 +572,7 @@ def _parse_js_reexport(self, export_node: Node, current_module: str) -> None: if child.type == cs.TS_ASTERISK: wildcard_key = f"*{source_module}" self.import_mapping[current_module][wildcard_key] = source_module - logger.debug(ls.IMP_JS_NAMESPACE_REEXPORT.format(module=source_module)) + logger.debug(ls.IMP_JS_NAMESPACE_REEXPORT, module=source_module) elif child.type == cs.TS_EXPORT_CLAUSE: for grandchild in child.children: if grandchild.type == cs.TS_EXPORT_SPECIFIER: @@ -561,11 +589,10 @@ def _parse_js_reexport(self, export_node: Node, current_module: str) -> None: f"{source_module}{cs.SEPARATOR_DOT}{original_name}" ) logger.debug( - ls.IMP_JS_REEXPORT.format( - exported=exported_name, - module=source_module, - original=original_name, - ) + ls.IMP_JS_REEXPORT, + exported=exported_name, + module=source_module, + original=original_name, ) def _parse_java_imports(self, captures: dict, module_qn: str) -> None: @@ -589,22 +616,22 @@ def _parse_java_imports(self, captures: dict, module_qn: str) -> None: resolved_path = self._resolve_java_import_path(imported_path) if is_wildcard: - logger.debug(ls.IMP_JAVA_WILDCARD.format(path=resolved_path)) + logger.debug(ls.IMP_JAVA_WILDCARD, path=resolved_path) self.import_mapping[module_qn][f"*{resolved_path}"] = resolved_path elif parts := resolved_path.split(cs.SEPARATOR_DOT): imported_name = parts[-1] self.import_mapping[module_qn][imported_name] = resolved_path if is_static: logger.debug( - ls.IMP_JAVA_STATIC.format( - name=imported_name, path=resolved_path - ) + ls.IMP_JAVA_STATIC, + name=imported_name, + path=resolved_path, ) else: logger.debug( - ls.IMP_JAVA_IMPORT.format( - name=imported_name, path=resolved_path - ) + ls.IMP_JAVA_IMPORT, + name=imported_name, + path=resolved_path, ) def _parse_rust_imports(self, captures: dict, module_qn: str) -> None: @@ -617,7 +644,7 @@ def _parse_rust_use_declaration(self, use_node: Node, module_qn: str) -> None: for imported_name, full_path in imports.items(): self.import_mapping[module_qn][imported_name] = full_path - logger.debug(ls.IMP_RUST.format(name=imported_name, path=full_path)) + logger.debug(ls.IMP_RUST, name=imported_name, path=full_path) def _parse_go_imports(self, captures: dict, module_qn: str) -> None: for import_node in captures.get(cs.CAPTURE_IMPORT, []): @@ -646,7 +673,7 @@ def _parse_go_import_spec(self, spec_node: Node, module_qn: str) -> None: if import_path: package_name = alias_name or import_path.split(cs.SEPARATOR_SLASH)[-1] self.import_mapping[module_qn][package_name] = import_path - 
logger.debug(ls.IMP_GO.format(package=package_name, path=import_path)) + logger.debug(ls.IMP_GO, package=package_name, path=import_path) def _parse_cpp_imports(self, captures: dict, module_qn: str) -> None: for import_node in captures.get(cs.CAPTURE_IMPORT, []): @@ -692,9 +719,10 @@ def _parse_cpp_include(self, include_node: Node, module_qn: str) -> None: self.import_mapping[module_qn][local_name] = full_name logger.debug( - ls.IMP_CPP_INCLUDE.format( - local=local_name, full=full_name, system=is_system_include - ) + ls.IMP_CPP_INCLUDE, + local=local_name, + full=full_name, + system=is_system_include, ) def _parse_cpp_module_import(self, import_node: Node, module_qn: str) -> None: @@ -727,7 +755,7 @@ def _parse_cpp_module_import(self, import_node: Node, module_qn: str) -> None: full_name = f"{cs.IMPORT_STD_PREFIX}{module_name}" self.import_mapping[module_qn][local_name] = full_name - logger.debug(ls.IMP_CPP_MODULE.format(local=local_name, full=full_name)) + logger.debug(ls.IMP_CPP_MODULE, local=local_name, full=full_name) def _parse_cpp_module_declaration(self, decl_node: Node, module_qn: str) -> None: decoded_text = safe_decode_text(decl_node) @@ -757,9 +785,9 @@ def _parse_cpp_module_declaration(self, decl_node: Node, module_qn: str) -> None full_name = f"{self.project_name}{cs.SEPARATOR_DOT}{partition_part}" self.import_mapping[module_qn][partition_name] = full_name logger.debug( - ls.IMP_CPP_PARTITION.format( - partition=partition_name, full=full_name - ) + ls.IMP_CPP_PARTITION, + partition=partition_name, + full=full_name, ) def _register_cpp_module_mapping( @@ -769,16 +797,74 @@ def _register_cpp_module_mapping( self.import_mapping[module_qn][module_name] = ( f"{self.project_name}{cs.SEPARATOR_DOT}{module_name}" ) - logger.debug(log_template.format(name=module_name)) + logger.debug(log_template, name=module_name) + + _PHP_INCLUDE_REQUIRE_TYPES = frozenset( + { + cs.TS_PHP_INCLUDE_EXPRESSION, + cs.TS_PHP_INCLUDE_ONCE_EXPRESSION, + cs.TS_PHP_REQUIRE_EXPRESSION, + cs.TS_PHP_REQUIRE_ONCE_EXPRESSION, + } + ) + + def _parse_php_imports(self, captures: dict, module_qn: str) -> None: + all_imports = captures.get(cs.CAPTURE_IMPORT, []) + captures.get( + cs.CAPTURE_IMPORT_FROM, [] + ) + for import_node in all_imports: + if import_node.type == cs.TS_PHP_NAMESPACE_USE_DECLARATION: + self._handle_php_use_declaration(import_node, module_qn) + elif import_node.type in self._PHP_INCLUDE_REQUIRE_TYPES: + self._handle_php_include_require(import_node, module_qn) + + def _handle_php_use_declaration(self, use_node: Node, module_qn: str) -> None: + for child in use_node.named_children: + if child.type != cs.TS_PHP_NAMESPACE_USE_CLAUSE: + continue + qn_node = next( + (c for c in child.named_children if c.type == cs.TS_PHP_QUALIFIED_NAME), + None, + ) + if not qn_node: + continue + imported_path = safe_decode_with_fallback(qn_node) + if not imported_path: + continue + imported_path = imported_path.replace("\\", cs.SEPARATOR_DOT) + alias_node = child.child_by_field_name("alias") + if alias_node and alias_node.text: + local_name = safe_decode_with_fallback(alias_node) + else: + parts = imported_path.split(cs.SEPARATOR_DOT) + local_name = parts[-1] if parts else imported_path + self.import_mapping[module_qn][local_name] = imported_path + + def _handle_php_include_require(self, node: Node, module_qn: str) -> None: + for child in node.children: + if child.type in {"string", "encapsed_string"}: + raw = safe_decode_with_fallback(child) + if not raw: + continue + path_str = raw.strip("'\"") + path_str = 
path_str.replace("/", cs.SEPARATOR_DOT).replace( + "\\", cs.SEPARATOR_DOT + ) + if path_str.endswith(".php"): + path_str = path_str[:-4] + parts = path_str.split(cs.SEPARATOR_DOT) + local_name = parts[-1] if parts else path_str + self.import_mapping[module_qn][local_name] = path_str + return def _parse_generic_imports( self, captures: dict, module_qn: str, lang_config: LanguageSpec ) -> None: for import_node in captures.get(cs.CAPTURE_IMPORT, []): logger.debug( - ls.IMP_GENERIC.format( - language=lang_config.language, node_type=import_node.type - ) + ls.IMP_GENERIC, + language=lang_config.language, + node_type=import_node.type, ) def _parse_lua_imports(self, captures: dict, module_qn: str) -> None: diff --git a/codebase_rag/parsers/java/method_resolver.py b/codebase_rag/parsers/java/method_resolver.py index 01bd25cae..54222c925 100644 --- a/codebase_rag/parsers/java/method_resolver.py +++ b/codebase_rag/parsers/java/method_resolver.py @@ -8,6 +8,7 @@ from ... import constants as cs from ... import logs as ls +from ...decorators import recursion_guard from ...types_defs import ASTNode, NodeType from ..utils import safe_decode_text from .utils import extract_method_call_info, get_class_context_from_qn @@ -20,6 +21,7 @@ class JavaMethodResolverMixin: + __slots__ = () import_processor: ImportProcessor function_registry: FunctionRegistryTrieProtocol project_name: str @@ -202,6 +204,10 @@ def _is_matching_method(self, member: str, method_name: str) -> bool: or member == f"{method_name}{cs.EMPTY_PARENS}" ) + @recursion_guard( + key_func=lambda self, class_qn, *_, **__: class_qn, + guard_name=cs.GUARD_INHERITED_METHOD, + ) def _find_inherited_method( self, class_qn: str, method_name: str, module_qn: str ) -> tuple[str, str] | None: @@ -235,8 +241,10 @@ def _resolve_java_method_return_type( parts = method_call.split(cs.SEPARATOR_DOT) if len(parts) < 2: method_name = method_call - if current_class_qn := self._get_current_class_name(module_qn): - return self._find_method_return_type(current_class_qn, method_name) + if (current_class_qn := self._get_current_class_name(module_qn)) and ( + result := self._find_method_return_type(current_class_qn, method_name) + ): + return result else: object_part = cs.SEPARATOR_DOT.join(parts[:-1]) method_name = parts[-1] @@ -348,34 +356,32 @@ def _do_resolve_java_method_call( logger.debug(ls.JAVA_NO_METHOD_NAME) return None - logger.debug( - ls.JAVA_RESOLVING_CALL.format(method=method_name, object=object_ref) - ) + logger.debug(ls.JAVA_RESOLVING_CALL, method=method_name, object=object_ref) if not object_ref: - logger.debug(ls.JAVA_RESOLVING_STATIC.format(method=method_name)) + logger.debug(ls.JAVA_RESOLVING_STATIC, method=method_name) result = self._resolve_static_or_local_method(str(method_name), module_qn) if result: - logger.debug(ls.JAVA_FOUND_STATIC.format(result=result)) + logger.debug(ls.JAVA_FOUND_STATIC, result=result) else: - logger.debug(ls.JAVA_STATIC_NOT_FOUND.format(method=method_name)) + logger.debug(ls.JAVA_STATIC_NOT_FOUND, method=method_name) return result - logger.debug(ls.JAVA_RESOLVING_OBJ_TYPE.format(object=object_ref)) + logger.debug(ls.JAVA_RESOLVING_OBJ_TYPE, object=object_ref) if not ( object_type := self._resolve_java_object_type( str(object_ref), local_var_types, module_qn ) ): - logger.debug(ls.JAVA_OBJ_TYPE_UNKNOWN.format(object=object_ref)) + logger.debug(ls.JAVA_OBJ_TYPE_UNKNOWN, object=object_ref) return None - logger.debug(ls.JAVA_OBJ_TYPE_RESOLVED.format(type=object_type)) + logger.debug(ls.JAVA_OBJ_TYPE_RESOLVED, type=object_type) 
result = self._resolve_instance_method(object_type, str(method_name), module_qn) if result: - logger.debug(ls.JAVA_FOUND_INSTANCE.format(result=result)) + logger.debug(ls.JAVA_FOUND_INSTANCE, result=result) else: logger.debug( - ls.JAVA_INSTANCE_NOT_FOUND.format(type=object_type, method=method_name) + ls.JAVA_INSTANCE_NOT_FOUND, type=object_type, method=method_name ) return result diff --git a/codebase_rag/parsers/java/type_inference.py b/codebase_rag/parsers/java/type_inference.py index 8fd86a7d2..9cb77e657 100644 --- a/codebase_rag/parsers/java/type_inference.py +++ b/codebase_rag/parsers/java/type_inference.py @@ -26,6 +26,21 @@ class JavaTypeInferenceEngine( JavaVariableAnalyzerMixin, JavaMethodResolverMixin, ): + __slots__ = ( + "import_processor", + "function_registry", + "repo_path", + "project_name", + "ast_cache", + "queries", + "module_qn_to_file_path", + "class_inheritance", + "simple_name_lookup", + "_lookup_cache", + "_lookup_in_progress", + "_fqn_to_module_qn", + ) + def __init__( self, import_processor: ImportProcessor, @@ -83,10 +98,10 @@ def build_variable_type_map( try: self._collect_all_variable_types(scope_node, local_var_types, module_qn) - logger.debug(ls.JAVA_VAR_TYPE_MAP_BUILT.format(count=len(local_var_types))) + logger.debug(ls.JAVA_VAR_TYPE_MAP_BUILT, count=len(local_var_types)) except Exception as e: - logger.error(ls.JAVA_VAR_TYPE_MAP_FAILED.format(error=e)) + logger.error(ls.JAVA_VAR_TYPE_MAP_FAILED, error=e) return local_var_types diff --git a/codebase_rag/parsers/java/type_resolver.py b/codebase_rag/parsers/java/type_resolver.py index cbb69fcf7..f1827e6e5 100644 --- a/codebase_rag/parsers/java/type_resolver.py +++ b/codebase_rag/parsers/java/type_resolver.py @@ -20,6 +20,7 @@ class JavaTypeResolverMixin: + __slots__ = () import_processor: ImportProcessor function_registry: FunctionRegistryTrieProtocol module_qn_to_file_path: dict[str, Path] diff --git a/codebase_rag/parsers/java/variable_analyzer.py b/codebase_rag/parsers/java/variable_analyzer.py index 65003d9bb..89057821e 100644 --- a/codebase_rag/parsers/java/variable_analyzer.py +++ b/codebase_rag/parsers/java/variable_analyzer.py @@ -23,6 +23,7 @@ class JavaVariableAnalyzerMixin: + __slots__ = () ast_cache: ASTCacheProtocol module_qn_to_file_path: dict[str, Path] _lookup_cache: dict[str, str | None] @@ -84,7 +85,7 @@ def _process_formal_parameter( if param_name and param_type: resolved_type = self._resolve_java_type_name(param_type, module_qn) local_var_types[param_name] = resolved_type - logger.debug(ls.JAVA_PARAM.format(name=param_name, type=resolved_type)) + logger.debug(ls.JAVA_PARAM, name=param_name, type=resolved_type) def _process_spread_parameter( self, param_node: ASTNode, local_var_types: dict[str, str], module_qn: str @@ -103,9 +104,7 @@ def _process_spread_parameter( if param_name and param_type: resolved_type = self._resolve_java_type_name(param_type, module_qn) local_var_types[param_name] = resolved_type - logger.debug( - ls.JAVA_VARARGS_PARAM.format(name=param_name, type=resolved_type) - ) + logger.debug(ls.JAVA_VARARGS_PARAM, name=param_name, type=resolved_type) def _analyze_java_local_variables( self, scope_node: ASTNode, local_var_types: dict[str, str], module_qn: str @@ -164,15 +163,13 @@ def _process_variable_declarator( resolved_type = self._resolve_java_type_name(inferred_type, module_qn) local_var_types[var_name] = resolved_type logger.debug( - ls.JAVA_LOCAL_VAR_INFERRED.format(name=var_name, type=resolved_type) + ls.JAVA_LOCAL_VAR_INFERRED, name=var_name, type=resolved_type ) 
return resolved_type = self._resolve_java_type_name(declared_type, module_qn) local_var_types[var_name] = resolved_type - logger.debug( - ls.JAVA_LOCAL_VAR_DECLARED.format(name=var_name, type=resolved_type) - ) + logger.debug(ls.JAVA_LOCAL_VAR_DECLARED, name=var_name, type=resolved_type) def _analyze_java_class_fields( self, scope_node: ASTNode, local_var_types: dict[str, str], module_qn: str @@ -201,7 +198,7 @@ def _analyze_java_class_fields( if str(field_name) not in local_var_types: local_var_types[str(field_name)] = resolved_type logger.debug( - ls.JAVA_CLASS_FIELD.format(name=field_name, type=resolved_type) + ls.JAVA_CLASS_FIELD, name=field_name, type=resolved_type ) def _analyze_java_constructor_assignments( @@ -235,7 +232,7 @@ def _process_java_assignment( ): resolved_type = self._resolve_java_type_name(inferred_type, module_qn) local_var_types[var_name] = resolved_type - logger.debug(ls.JAVA_ASSIGNMENT.format(name=var_name, type=resolved_type)) + logger.debug(ls.JAVA_ASSIGNMENT, name=var_name, type=resolved_type) def _extract_java_variable_reference(self, node: ASTNode) -> str | None: match node.type: @@ -297,9 +294,7 @@ def _register_for_loop_variable( ): resolved_type = self._resolve_java_type_name(var_type, module_qn) local_var_types[var_name] = resolved_type - logger.debug( - ls.JAVA_ENHANCED_FOR_VAR.format(name=var_name, type=resolved_type) - ) + logger.debug(ls.JAVA_ENHANCED_FOR_VAR, name=var_name, type=resolved_type) def _extract_for_loop_variable_from_children( self, for_node: ASTNode, local_var_types: dict[str, str], module_qn: str @@ -325,9 +320,9 @@ def _extract_for_loop_variable_from_children( ) local_var_types[var_name] = resolved_type logger.debug( - ls.JAVA_ENHANCED_FOR_VAR_ALT.format( - name=var_name, type=resolved_type - ) + ls.JAVA_ENHANCED_FOR_VAR_ALT, + name=var_name, + type=resolved_type, ) break diff --git a/codebase_rag/parsers/js_ts/ingest.py b/codebase_rag/parsers/js_ts/ingest.py index 30580e184..11e516c91 100644 --- a/codebase_rag/parsers/js_ts/ingest.py +++ b/codebase_rag/parsers/js_ts/ingest.py @@ -16,7 +16,7 @@ PropertyDict, SimpleNameLookup, ) -from ..utils import safe_decode_text, safe_decode_with_fallback +from ..utils import safe_decode_text, safe_decode_with_fallback, sorted_captures from .module_system import JsTsModuleSystemMixin from .utils import get_js_ts_language_obj @@ -29,6 +29,7 @@ class JsTsIngestMixin(JsTsModuleSystemMixin): + __slots__ = () ingestor: IngestorProtocol repo_path: Path project_name: str @@ -88,14 +89,14 @@ def _ingest_prototype_inheritance_links( language_obj, root_node, module_qn ) except Exception as e: - logger.debug(lg.JS_PROTOTYPE_INHERITANCE_FAILED.format(error=e)) + logger.debug(lg.JS_PROTOTYPE_INHERITANCE_FAILED, error=e) def _process_prototype_inheritance_captures( self, language_obj, root_node, module_qn ): query = Query(language_obj, cs.JS_PROTOTYPE_INHERITANCE_QUERY) cursor = QueryCursor(query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) child_classes = captures.get(cs.CAPTURE_CHILD_CLASS, []) parent_classes = captures.get(cs.CAPTURE_PARENT_CLASS, []) @@ -122,9 +123,7 @@ def _process_prototype_inheritance_captures( ) logger.debug( - lg.JS_PROTOTYPE_INHERITANCE.format( - child_qn=child_qn, parent_qn=parent_qn - ) + lg.JS_PROTOTYPE_INHERITANCE, child_qn=child_qn, parent_qn=parent_qn ) def _ingest_prototype_method_assignments( @@ -143,12 +142,12 @@ def _ingest_prototype_method_assignments( try: self._process_prototype_method_captures(language_obj, root_node, module_qn) 
except Exception as e: - logger.debug(lg.JS_PROTOTYPE_METHODS_FAILED.format(error=e)) + logger.debug(lg.JS_PROTOTYPE_METHODS_FAILED, error=e) def _process_prototype_method_captures(self, language_obj, root_node, module_qn): method_query = Query(language_obj, cs.JS_PROTOTYPE_METHOD_QUERY) method_cursor = QueryCursor(method_query) - method_captures = method_cursor.captures(root_node) + method_captures = sorted_captures(method_cursor, root_node) constructor_names = method_captures.get(cs.CAPTURE_CONSTRUCTOR_NAME, []) method_names = method_captures.get(cs.CAPTURE_METHOD_NAME, []) @@ -174,9 +173,9 @@ def _process_prototype_method_captures(self, language_obj, root_node, module_qn) cs.KEY_DOCSTRING: self._get_docstring(func_node), } logger.info( - lg.JS_PROTOTYPE_METHOD_FOUND.format( - method_name=method_name, method_qn=method_qn - ) + lg.JS_PROTOTYPE_METHOD_FOUND, + method_name=method_name, + method_qn=method_qn, ) self.ingestor.ensure_node_batch(cs.NodeLabel.FUNCTION, method_props) @@ -190,9 +189,9 @@ def _process_prototype_method_captures(self, language_obj, root_node, module_qn) ) logger.debug( - lg.JS_PROTOTYPE_METHOD_DEFINES.format( - constructor_qn=constructor_qn, method_qn=method_qn - ) + lg.JS_PROTOTYPE_METHOD_DEFINES, + constructor_qn=constructor_qn, + method_qn=method_qn, ) def _ingest_object_literal_methods( @@ -213,7 +212,7 @@ def _ingest_object_literal_methods( language_obj, query_text, root_node, module_qn, lang_config ) except Exception as e: - logger.debug(lg.JS_OBJECT_METHODS_DETECT_FAILED.format(error=e)) + logger.debug(lg.JS_OBJECT_METHODS_DETECT_FAILED, error=e) def _process_object_method_query( self, @@ -226,7 +225,7 @@ def _process_object_method_query( try: query = Query(language_obj, query_text) cursor = QueryCursor(query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) method_names = captures.get(cs.CAPTURE_METHOD_NAME, []) method_functions = captures.get(cs.CAPTURE_METHOD_FUNCTION, []) @@ -250,7 +249,7 @@ def _process_object_method_query( method_name_node, method_func_node, module_qn, lang_config ) except Exception as e: - logger.debug(lg.JS_OBJECT_METHODS_PROCESS_FAILED.format(error=e)) + logger.debug(lg.JS_OBJECT_METHODS_PROCESS_FAILED, error=e) def _process_single_object_method( self, @@ -314,9 +313,7 @@ def _register_object_method( cs.KEY_DOCSTRING: self._get_docstring(method_func_node), } logger.info( - lg.JS_OBJECT_METHOD_FOUND.format( - method_name=method_name, method_qn=method_qn - ) + lg.JS_OBJECT_METHOD_FOUND, method_name=method_name, method_qn=method_qn ) self.ingestor.ensure_node_batch(cs.NodeLabel.FUNCTION, method_props) @@ -352,7 +349,7 @@ def _ingest_assignment_arrow_functions( lang_query, query_text, root_node, module_qn, lang_config ) except Exception as e: - logger.debug(lg.JS_ASSIGNMENT_ARROW_DETECT_FAILED.format(error=e)) + logger.debug(lg.JS_ASSIGNMENT_ARROW_DETECT_FAILED, error=e) def _process_arrow_query( self, @@ -365,7 +362,7 @@ def _process_arrow_query( try: query = Query(lang_query, query_text) cursor = QueryCursor(query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) method_names = captures.get(cs.CAPTURE_METHOD_NAME, []) member_exprs = captures.get(cs.CAPTURE_MEMBER_EXPR, []) @@ -390,7 +387,7 @@ def _process_arrow_query( lg.JS_ASSIGNMENT_FUNC_EXPR_FOUND, ) except Exception as e: - logger.debug(lg.JS_ASSIGNMENT_ARROW_QUERY_FAILED.format(error=e)) + logger.debug(lg.JS_ASSIGNMENT_ARROW_QUERY_FAILED, error=e) def _process_direct_arrow_functions( self, @@ -506,9 
+503,7 @@ def _register_arrow_function( cs.KEY_DOCSTRING: self._get_docstring(function_node), } - logger.debug( - log_message.format(function_name=function_name, function_qn=function_qn) - ) + logger.debug(log_message, function_name=function_name, function_qn=function_qn) self.ingestor.ensure_node_batch(cs.NodeLabel.FUNCTION, function_props) self.function_registry[function_qn] = NodeType.FUNCTION self.simple_name_lookup[function_name].add(function_qn) diff --git a/codebase_rag/parsers/js_ts/module_system.py b/codebase_rag/parsers/js_ts/module_system.py index 436603575..8c3a1b6c5 100644 --- a/codebase_rag/parsers/js_ts/module_system.py +++ b/codebase_rag/parsers/js_ts/module_system.py @@ -15,6 +15,7 @@ ingest_exported_function, safe_decode_text, safe_decode_with_fallback, + sorted_captures, ) from .utils import get_js_ts_language_obj @@ -29,6 +30,7 @@ class JsTsModuleSystemMixin: + __slots__ = ("_processed_imports",) ingestor: IngestorProtocol repo_path: Path project_name: str @@ -61,7 +63,7 @@ def _ingest_missing_import_patterns( try: query = Query(language_obj, cs.JS_COMMONJS_DESTRUCTURE_QUERY) cursor = QueryCursor(query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) variable_declarators = captures.get(cs.CAPTURE_VARIABLE_DECLARATOR, []) @@ -71,10 +73,10 @@ def _ingest_missing_import_patterns( ) except Exception as e: - logger.debug(ls.JS_COMMONJS_DESTRUCTURE_FAILED.format(error=e)) + logger.debug(ls.JS_COMMONJS_DESTRUCTURE_FAILED, error=e) except Exception as e: - logger.debug(ls.JS_MISSING_IMPORT_PATTERNS_FAILED.format(error=e)) + logger.debug(ls.JS_MISSING_IMPORT_PATTERNS_FAILED, error=e) def _extract_require_module_name(self, declarator: ASTNode) -> str | None: name_node = declarator.child_by_field_name(cs.FIELD_NAME) @@ -148,7 +150,7 @@ def _process_variable_declarator_for_commonjs( self._process_destructured_child(child, module_name, module_qn) except Exception as e: - logger.debug(ls.JS_COMMONJS_VAR_DECLARATOR_FAILED.format(error=e)) + logger.debug(ls.JS_COMMONJS_VAR_DECLARATOR_FAILED, error=e) def _process_commonjs_import( self, imported_name: str, module_name: str, module_qn: str @@ -179,20 +181,17 @@ def _process_commonjs_import( ) logger.debug( - ls.JS_MISSING_IMPORT_PATTERN.format( - module_qn=module_qn, - imported_name=imported_name, - resolved_source_module=resolved_source_module, - ) + ls.JS_MISSING_IMPORT_PATTERN, + module_qn=module_qn, + imported_name=imported_name, + resolved_source_module=resolved_source_module, ) self._processed_imports.add(import_key) except Exception as e: logger.debug( - ls.JS_COMMONJS_IMPORT_FAILED.format( - imported_name=imported_name, error=e - ) + ls.JS_COMMONJS_IMPORT_FAILED, imported_name=imported_name, error=e ) def _ingest_export_function( @@ -282,9 +281,8 @@ def _ingest_commonjs_exports( for query_text in query_texts: try: - captures = QueryCursor(Query(language_obj, query_text)).captures( - root_node - ) + cursor = QueryCursor(Query(language_obj, query_text)) + captures = sorted_captures(cursor, root_node) self._process_exports_pattern( captures.get(cs.CAPTURE_EXPORTS_OBJ, []), @@ -302,7 +300,7 @@ def _ingest_commonjs_exports( ) except Exception as e: - logger.debug(ls.JS_COMMONJS_EXPORTS_QUERY_FAILED.format(error=e)) + logger.debug(ls.JS_COMMONJS_EXPORTS_QUERY_FAILED, error=e) def _ingest_es6_exports( self, @@ -322,7 +320,7 @@ def _ingest_es6_exports( cleaned_query = textwrap.dedent(query_text).strip() query = Query(lang_query, cleaned_query) cursor = QueryCursor(query) - captures = 
cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) export_names = captures.get(cs.CAPTURE_EXPORT_NAME, []) export_functions = captures.get(cs.CAPTURE_EXPORT_FUNCTION, []) @@ -365,7 +363,7 @@ def _ingest_es6_exports( ) except Exception as e: - logger.debug(ls.JS_ES6_EXPORTS_QUERY_FAILED.format(error=e)) + logger.debug(ls.JS_ES6_EXPORTS_QUERY_FAILED, error=e) except Exception as e: - logger.debug(ls.JS_ES6_EXPORTS_DETECT_FAILED.format(error=e)) + logger.debug(ls.JS_ES6_EXPORTS_DETECT_FAILED, error=e) diff --git a/codebase_rag/parsers/js_ts/type_inference.py b/codebase_rag/parsers/js_ts/type_inference.py index e4930e365..29a435c77 100644 --- a/codebase_rag/parsers/js_ts/type_inference.py +++ b/codebase_rag/parsers/js_ts/type_inference.py @@ -11,6 +11,13 @@ class JsTypeInferenceEngine: + __slots__ = ( + "import_processor", + "function_registry", + "project_name", + "_find_method_ast_node", + ) + def __init__( self, import_processor: ImportProcessor, @@ -46,9 +53,9 @@ def build_local_variable_type_map( var_name = safe_decode_text(name_node) if var_name is not None: logger.debug( - ls.JS_VAR_DECLARATOR_FOUND.format( - var_name=var_name, module_qn=module_qn - ) + ls.JS_VAR_DECLARATOR_FOUND, + var_name=var_name, + module_qn=module_qn, ) if var_type := self._infer_js_variable_type_from_value( @@ -56,28 +63,26 @@ def build_local_variable_type_map( ): local_var_types[var_name] = var_type logger.debug( - ls.JS_VAR_INFERRED.format( - var_name=var_name, var_type=var_type - ) + ls.JS_VAR_INFERRED, + var_name=var_name, + var_type=var_type, ) else: - logger.debug( - ls.JS_VAR_INFER_FAILED.format(var_name=var_name) - ) + logger.debug(ls.JS_VAR_INFER_FAILED, var_name=var_name) stack.extend(reversed(current.children)) logger.debug( - ls.JS_VAR_TYPE_MAP_BUILT.format( - count=len(local_var_types), declarator_count=declarator_count - ) + ls.JS_VAR_TYPE_MAP_BUILT, + count=len(local_var_types), + declarator_count=declarator_count, ) return local_var_types def _infer_js_variable_type_from_value( self, value_node: ASTNode, module_qn: str ) -> str | None: - logger.debug(ls.JS_INFER_VALUE_NODE.format(node_type=value_node.type)) + logger.debug(ls.JS_INFER_VALUE_NODE, node_type=value_node.type) if value_node.type == cs.TS_NEW_EXPRESSION: if class_name := ut.extract_constructor_name(value_node): @@ -87,28 +92,23 @@ def _infer_js_variable_type_from_value( elif value_node.type == cs.TS_CALL_EXPRESSION: func_node = value_node.child_by_field_name("function") func_type = func_node.type if func_node else cs.STR_NONE - logger.debug(ls.JS_CALL_EXPR_FUNC_NODE.format(func_type=func_type)) + logger.debug(ls.JS_CALL_EXPR_FUNC_NODE, func_type=func_type) if func_node and func_node.type == cs.TS_MEMBER_EXPRESSION: method_call_text = ut.extract_method_call(func_node) - logger.debug( - ls.JS_EXTRACTED_METHOD_CALL.format(method_call=method_call_text) - ) + logger.debug(ls.JS_EXTRACTED_METHOD_CALL, method_call=method_call_text) if method_call_text: if inferred_type := self._infer_js_method_return_type( method_call_text, module_qn ): logger.debug( - ls.JS_TYPE_INFERRED.format( - method_call=method_call_text, - inferred_type=inferred_type, - ) + ls.JS_TYPE_INFERRED, + method_call=method_call_text, + inferred_type=inferred_type, ) return inferred_type logger.debug( - ls.JS_RETURN_TYPE_INFER_FAILED.format( - method_call=method_call_text - ) + ls.JS_RETURN_TYPE_INFER_FAILED, method_call=method_call_text ) elif func_node and func_node.type == cs.TS_IDENTIFIER: @@ -116,7 +116,7 @@ def _infer_js_variable_type_from_value( if 
func_name: return safe_decode_text(func_node) - logger.debug(ls.JS_NO_PATTERN_MATCHED.format(node_type=value_node.type)) + logger.debug(ls.JS_NO_PATTERN_MATCHED, node_type=value_node.type) return None def _infer_js_method_return_type( @@ -124,7 +124,7 @@ def _infer_js_method_return_type( ) -> str | None: parts = method_call.split(cs.SEPARATOR_DOT) if len(parts) != 2: - logger.debug(ls.JS_METHOD_CALL_INVALID.format(method_call=method_call)) + logger.debug(ls.JS_METHOD_CALL_INVALID, method_call=method_call) return None class_name, method_name = parts @@ -132,27 +132,23 @@ def _infer_js_method_return_type( class_qn = self._resolve_js_class_name(class_name, module_qn) if not class_qn: logger.debug( - ls.JS_CLASS_RESOLVE_FAILED.format( - class_name=class_name, module_qn=module_qn - ) + ls.JS_CLASS_RESOLVE_FAILED, class_name=class_name, module_qn=module_qn ) return None - logger.debug( - ls.JS_CLASS_RESOLVED.format(class_name=class_name, class_qn=class_qn) - ) + logger.debug(ls.JS_CLASS_RESOLVED, class_name=class_name, class_qn=class_qn) method_qn = f"{class_qn}{cs.SEPARATOR_DOT}{method_name}" - logger.debug(ls.JS_LOOKING_FOR_METHOD.format(method_qn=method_qn)) + logger.debug(ls.JS_LOOKING_FOR_METHOD, method_qn=method_qn) method_node = self._find_method_ast_node(method_qn) if not method_node: - logger.debug(ls.JS_METHOD_AST_NOT_FOUND.format(method_qn=method_qn)) + logger.debug(ls.JS_METHOD_AST_NOT_FOUND, method_qn=method_qn) return None return_type = self._analyze_return_statements(method_node, method_qn) logger.debug( - ls.JS_RETURN_ANALYZED.format(method_qn=method_qn, return_type=return_type) + ls.JS_RETURN_ANALYZED, method_qn=method_qn, return_type=return_type ) return return_type diff --git a/codebase_rag/parsers/lua/type_inference.py b/codebase_rag/parsers/lua/type_inference.py index 99a5515ba..92b910881 100644 --- a/codebase_rag/parsers/lua/type_inference.py +++ b/codebase_rag/parsers/lua/type_inference.py @@ -14,6 +14,12 @@ class LuaTypeInferenceEngine: + __slots__ = ( + "import_processor", + "function_registry", + "project_name", + ) + def __init__( self, import_processor: ImportProcessor, @@ -36,7 +42,7 @@ def build_local_variable_type_map( self._process_variable_declaration(current, module_qn, local_var_types) stack.extend(reversed(current.children)) - logger.debug(ls.LUA_VAR_TYPE_MAP_BUILT.format(count=len(local_var_types))) + logger.debug(ls.LUA_VAR_TYPE_MAP_BUILT, count=len(local_var_types)) return local_var_types def _process_variable_declaration( @@ -62,9 +68,7 @@ def _process_variable_declaration( func_calls[i], module_qn ): local_var_types[var_name] = var_type - logger.debug( - ls.LUA_VAR_INFERRED.format(var_name=var_name, var_type=var_type) - ) + logger.debug(ls.LUA_VAR_INFERRED, var_name=var_name, var_type=var_type) def _extract_var_names(self, assignment: TreeSitterNodeProtocol) -> list[str]: names: list[str] = [] @@ -110,11 +114,10 @@ def _infer_lua_variable_type_from_value( class_name, module_qn ): logger.debug( - ls.LUA_TYPE_INFERENCE_RETURN.format( - class_name=class_name, - method_name=method_name, - class_qn=class_qn, - ) + ls.LUA_TYPE_INFERENCE_RETURN, + class_name=class_name, + method_name=method_name, + class_qn=class_qn, ) return class_qn diff --git a/codebase_rag/parsers/py/ast_analyzer.py b/codebase_rag/parsers/py/ast_analyzer.py index ec663db4f..b1d4875c0 100644 --- a/codebase_rag/parsers/py/ast_analyzer.py +++ b/codebase_rag/parsers/py/ast_analyzer.py @@ -10,7 +10,7 @@ from ... 
import logs as lg from ...types_defs import LanguageQueries from ..js_ts.utils import find_method_in_ast as find_js_method_in_ast -from ..utils import safe_decode_text +from ..utils import safe_decode_text, sorted_captures if TYPE_CHECKING: from collections.abc import Callable @@ -45,6 +45,7 @@ def _infer_instance_variable_types_from_assignments( class PythonAstAnalyzerMixin(_AstBase): + __slots__ = () queries: dict[cs.SupportedLanguage, LanguageQueries] module_qn_to_file_path: dict[str, Path] ast_cache: ASTCacheProtocol @@ -140,7 +141,7 @@ def _process_assignment_simple( right_node, module_qn ): local_var_types[var_name] = inferred_type - logger.debug(lg.PY_TYPE_SIMPLE.format(var=var_name, type=inferred_type)) + logger.debug(lg.PY_TYPE_SIMPLE, var=var_name, type=inferred_type) def _process_assignment_complex( self, assignment_node: Node, local_var_types: dict[str, str], module_qn: str @@ -162,7 +163,7 @@ def _process_assignment_complex( right_node, module_qn, local_var_types ): local_var_types[var_name] = inferred_type - logger.debug(lg.PY_TYPE_COMPLEX.format(var=var_name, type=inferred_type)) + logger.debug(lg.PY_TYPE_COMPLEX, var=var_name, type=inferred_type) def _extract_assignment_variable_name(self, node: Node) -> str | None: if node.type != cs.TS_PY_IDENTIFIER or node.text is None: @@ -210,7 +211,7 @@ def _find_python_method_in_ast( if not class_query: return None cursor = QueryCursor(class_query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) method_query = lang_queries[cs.QUERY_KEY_FUNCTIONS] if not method_query: @@ -232,7 +233,7 @@ def _find_python_method_in_ast( continue method_cursor = QueryCursor(method_query) - method_captures = method_cursor.captures(body_node) + method_captures = sorted_captures(method_cursor, body_node) for method_node in method_captures.get(cs.QUERY_CAPTURE_FUNCTION, []): if not isinstance(method_node, Node): @@ -344,13 +345,11 @@ def _analyze_identifier_return(self, expr_node: Node, method_qn: str) -> str | N local_vars = self.build_local_variable_type_map(method_node, module_qn) if identifier in local_vars: logger.debug( - lg.PY_VAR_FROM_CONTEXT.format( - var=identifier, type=local_vars[identifier] - ) + lg.PY_VAR_FROM_CONTEXT, var=identifier, type=local_vars[identifier] ) return local_vars[identifier] - logger.debug(lg.PY_VAR_CANNOT_INFER.format(var=identifier)) + logger.debug(lg.PY_VAR_CANNOT_INFER, var=identifier) return None def _analyze_attribute_return(self, expr_node: Node, method_qn: str) -> str | None: diff --git a/codebase_rag/parsers/py/expression_analyzer.py b/codebase_rag/parsers/py/expression_analyzer.py index 81e0c28a2..19ce80b16 100644 --- a/codebase_rag/parsers/py/expression_analyzer.py +++ b/codebase_rag/parsers/py/expression_analyzer.py @@ -40,6 +40,7 @@ def _analyze_method_return_statements( class PythonExpressionAnalyzerMixin(_ExprBase): + __slots__ = () import_processor: ImportProcessor function_registry: FunctionRegistryTrieProtocol simple_name_lookup: SimpleNameLookup @@ -243,7 +244,7 @@ def _infer_method_return_type( return self._analyze_method_return_statements(method_node, method_qn) return None except Exception as e: - logger.debug(lg.PY_INFER_RETURN_FAILED.format(method=method_call, error=e)) + logger.debug(lg.PY_INFER_RETURN_FAILED, method=method_call, error=e) return None def _resolve_method_qualified_name( @@ -305,11 +306,10 @@ def _resolve_class_method( for qn in self.simple_name_lookup.get(class_name, []): if result := self._try_resolve_method(qn, method_name): logger.debug( - 
lg.PY_RESOLVED_METHOD.format( - class_name=class_name, - method_name=method_name, - method_qn=result, - ) + lg.PY_RESOLVED_METHOD, + class_name=class_name, + method_name=method_name, + method_qn=result, ) return result @@ -355,7 +355,7 @@ def _try_infer_from_self_assignments( return instance_vars.get(full_attr_name) except Exception as e: - logger.debug(lg.PY_INFER_ATTR_FAILED.format(attr=attribute_name, error=e)) + logger.debug(lg.PY_INFER_ATTR_FAILED, attr=attribute_name, error=e) return None def _find_class_in_scope(self, class_name: str, module_qn: str) -> str | None: diff --git a/codebase_rag/parsers/py/type_inference.py b/codebase_rag/parsers/py/type_inference.py index 5908ee76a..5ba8bc3f2 100644 --- a/codebase_rag/parsers/py/type_inference.py +++ b/codebase_rag/parsers/py/type_inference.py @@ -30,6 +30,21 @@ class PythonTypeInferenceEngine( PythonAstAnalyzerMixin, PythonVariableAnalyzerMixin, ): + __slots__ = ( + "import_processor", + "function_registry", + "repo_path", + "project_name", + "ast_cache", + "queries", + "module_qn_to_file_path", + "class_inheritance", + "simple_name_lookup", + "_js_type_inference_getter", + "_method_return_type_cache", + "_type_inference_in_progress", + ) + def __init__( self, import_processor: ImportProcessor, @@ -68,6 +83,6 @@ def build_local_variable_type_map( self._traverse_single_pass(caller_node, local_var_types, module_qn) except Exception as e: - logger.debug(lg.PY_BUILD_VAR_MAP_FAILED.format(error=e)) + logger.debug(lg.PY_BUILD_VAR_MAP_FAILED, error=e) return local_var_types diff --git a/codebase_rag/parsers/py/variable_analyzer.py b/codebase_rag/parsers/py/variable_analyzer.py index 9a49f9a27..53a55932b 100644 --- a/codebase_rag/parsers/py/variable_analyzer.py +++ b/codebase_rag/parsers/py/variable_analyzer.py @@ -23,6 +23,7 @@ def _infer_type_from_expression( class PythonVariableAnalyzerMixin(_VarBase): + __slots__ = () import_processor: ImportProcessor function_registry: FunctionRegistryTrieProtocol @@ -61,9 +62,7 @@ def _process_untyped_parameter( ): return local_var_types[param_name] = inferred_type - logger.debug( - lg.PY_PARAM_TYPE_INFERRED.format(param=param_name, type=inferred_type) - ) + logger.debug(lg.PY_PARAM_TYPE_INFERRED, param=param_name, type=inferred_type) def _process_typed_parameter( self, param: ASTNode, local_var_types: dict[str, str] @@ -102,11 +101,9 @@ def _process_typed_default_parameter( def _infer_type_from_parameter_name( self, param_name: str, module_qn: str ) -> str | None: - logger.debug( - lg.PY_TYPE_INFER_ATTEMPT.format(param=param_name, module=module_qn) - ) + logger.debug(lg.PY_TYPE_INFER_ATTEMPT, param=param_name, module=module_qn) available_class_names = self._collect_available_classes(module_qn) - logger.debug(lg.PY_AVAILABLE_CLASSES.format(classes=available_class_names)) + logger.debug(lg.PY_AVAILABLE_CLASSES, classes=available_class_names) return self._find_best_class_match(param_name, available_class_names) def _collect_available_classes(self, module_qn: str) -> list[str]: @@ -142,9 +139,7 @@ def _find_best_class_match( best_match = class_name logger.debug( - lg.PY_BEST_MATCH.format( - param=param_name, match=best_match, score=highest_score - ) + lg.PY_BEST_MATCH, param=param_name, match=best_match, score=highest_score ) return best_match @@ -195,9 +190,7 @@ def _infer_loop_var_from_iterable( right_node, local_var_types, module_qn ): local_var_types[loop_var] = element_type - logger.debug( - lg.PY_LOOP_VAR_INFERRED.format(var=loop_var, type=element_type) - ) + logger.debug(lg.PY_LOOP_VAR_INFERRED, 
var=loop_var, type=element_type) def _infer_iterable_element_type( self, iterable_node: ASTNode, local_var_types: dict[str, str], module_qn: str @@ -256,9 +249,7 @@ def _process_self_assignment( ): return local_var_types[attr_name] = assigned_type - logger.debug( - lg.PY_INSTANCE_VAR_INFERRED.format(attr=attr_name, type=assigned_type) - ) + logger.debug(lg.PY_INSTANCE_VAR_INFERRED, attr=attr_name, type=assigned_type) def _analyze_self_assignments( self, node: ASTNode, local_var_types: dict[str, str], module_qn: str diff --git a/codebase_rag/parsers/stdlib_extractor.py b/codebase_rag/parsers/stdlib_extractor.py index fbcbddd4c..7e073d502 100644 --- a/codebase_rag/parsers/stdlib_extractor.py +++ b/codebase_rag/parsers/stdlib_extractor.py @@ -42,7 +42,7 @@ def _is_tool_available(tool_name: str) -> bool: subprocess.CalledProcessError, ): _EXTERNAL_TOOLS[tool_name] = False - logger.debug(ls.IMP_TOOL_NOT_AVAILABLE.format(tool=tool_name)) + logger.debug(ls.IMP_TOOL_NOT_AVAILABLE, tool=tool_name) return False @@ -77,9 +77,9 @@ def load_persistent_cache() -> None: data = json.load(f) _STDLIB_CACHE.update(data.get(cs.IMPORT_CACHE_KEY, {})) _CACHE_TIMESTAMPS.update(data.get(cs.IMPORT_TIMESTAMPS_KEY, {})) - logger.debug(ls.IMP_CACHE_LOADED.format(path=cache_file)) + logger.debug(ls.IMP_CACHE_LOADED, path=cache_file) except (json.JSONDecodeError, OSError) as e: - logger.debug(ls.IMP_CACHE_LOAD_ERROR.format(error=e)) + logger.debug(ls.IMP_CACHE_LOAD_ERROR, error=e) def save_persistent_cache() -> None: @@ -97,9 +97,9 @@ def save_persistent_cache() -> None: f, indent=2, ) - logger.debug(ls.IMP_CACHE_SAVED.format(path=cache_file)) + logger.debug(ls.IMP_CACHE_SAVED, path=cache_file) except OSError as e: - logger.debug(ls.IMP_CACHE_SAVE_ERROR.format(error=e)) + logger.debug(ls.IMP_CACHE_SAVE_ERROR, error=e) def flush_stdlib_cache() -> None: @@ -115,7 +115,7 @@ def clear_stdlib_cache() -> None: cache_file.unlink() logger.debug(ls.IMP_CACHE_CLEARED) except OSError as e: - logger.debug(ls.IMP_CACHE_CLEAR_ERROR.format(error=e)) + logger.debug(ls.IMP_CACHE_CLEAR_ERROR, error=e) def get_stdlib_cache_stats() -> StdlibCacheStats: @@ -130,6 +130,8 @@ def get_stdlib_cache_stats() -> StdlibCacheStats: class StdlibExtractor: + __slots__ = ("function_registry", "repo_path", "project_name") + def __init__( self, function_registry: FunctionRegistryTrieProtocol | None = None, @@ -248,7 +250,7 @@ def _resolve_python_entity_module_path( result = ( cs.SEPARATOR_DOT.join(parts[:-1]) - if entity_name[0].isupper() + if entity_name[:1].isupper() else full_qualified_name ) _cache_stdlib_result(cs.SupportedLanguage.PYTHON, full_qualified_name, result) @@ -330,11 +332,7 @@ def _resolve_js_entity_module_path( ): pass - result = ( - cs.SEPARATOR_DOT.join(parts[:-1]) - if entity_name[0].isupper() - else full_qualified_name - ) + result = cs.SEPARATOR_DOT.join(parts[:-1]) _cache_stdlib_result(cs.SupportedLanguage.JS, full_qualified_name, result) return result @@ -464,7 +462,7 @@ def _extract_go_stdlib_path(self, full_qualified_name: str) -> str: pass entity_name = parts[-1] - if entity_name[0].isupper(): + if entity_name[:1].isupper(): return cs.SEPARATOR_SLASH.join(parts[:-1]) return full_qualified_name @@ -475,7 +473,7 @@ def _extract_rust_stdlib_path(self, full_qualified_name: str) -> str: entity_name = parts[-1] if ( - entity_name[0].isupper() + entity_name[:1].isupper() or entity_name.isupper() or (cs.CHAR_UNDERSCORE not in entity_name and entity_name.islower()) ): @@ -541,7 +539,7 @@ def _extract_cpp_stdlib_path(self, 
full_qualified_name: str) -> str: entity_name = parts[-1] if ( - entity_name[0].isupper() + entity_name[:1].isupper() or entity_name.startswith(cs.CPP_PREFIX_IS) or entity_name.startswith(cs.CPP_PREFIX_HAS) or entity_name in cs.CPP_STDLIB_ENTITIES @@ -551,132 +549,45 @@ def _extract_cpp_stdlib_path(self, full_qualified_name: str) -> str: return full_qualified_name def _extract_java_stdlib_path(self, full_qualified_name: str) -> str: + cached_result = _get_cached_stdlib_result( + cs.SupportedLanguage.JAVA, full_qualified_name + ) + if cached_result is not None: + return cached_result + parts = full_qualified_name.split(cs.SEPARATOR_DOT) if len(parts) >= 2: - try: - import os - import subprocess - import tempfile - - package_name = cs.SEPARATOR_DOT.join(parts[:-1]) - entity_name = parts[-1] - - java_program = """ -import java.lang.reflect.*; - -public class StdlibCheck { - public static void main(String[] args) { - if (args.length < 2) { - System.out.println("{\\"hasEntity\\": false}"); - return; - } - - String packageName = args[0]; - String entityName = args[1]; - - try { - Class clazz = Class.forName(packageName + "." + entityName); - System.out.println("{\\"hasEntity\\": true, \\"entityType\\": \\"class\\"}"); - } catch (ClassNotFoundException e) { - // Try as method or field in parent package - try { - Class packageClass = Class.forName(packageName); - Method[] methods = packageClass.getMethods(); - Field[] fields = packageClass.getFields(); - - boolean foundMethod = false; - for (Method method : methods) { - if (method.getName().equals(entityName)) { - foundMethod = true; - break; - } - } - - boolean foundField = false; - for (Field field : fields) { - if (field.getName().equals(entityName)) { - foundField = true; - break; - } - } - - if (foundMethod || foundField) { - System.out.println("{\\"hasEntity\\": true, \\"entityType\\": \\"member\\"}"); - } else { - System.out.println("{\\"hasEntity\\": false}"); - } - } catch (Exception ex) { - System.out.println("{\\"hasEntity\\": false}"); - } - } - } -} - """ - - with tempfile.NamedTemporaryFile( - mode="w", suffix=".java", delete=False - ) as f: - f.write(java_program) - java_file = f.name - - try: - compile_result = subprocess.run( - ["javac", java_file], - check=False, - capture_output=True, - text=True, - timeout=10, - ) - - if compile_result.returncode == 0: - class_name = os.path.splitext(os.path.basename(java_file))[0] - run_result = subprocess.run( - [ - "java", - "-cp", - os.path.dirname(java_file), - class_name, - package_name, - entity_name, - ], - check=False, - capture_output=True, - text=True, - timeout=10, - ) - - if run_result.returncode == 0: - data = json.loads(run_result.stdout.strip()) - if data.get(cs.JSON_KEY_HAS_ENTITY): - return cs.SEPARATOR_DOT.join(parts[:-1]) - - finally: - for ext in (cs.EXT_JAVA, cs.EXT_CLASS): - temp_file = os.path.splitext(java_file)[0] + ext - try: - os.unlink(temp_file) - except OSError: - pass - - except ( - subprocess.TimeoutExpired, - subprocess.CalledProcessError, - json.JSONDecodeError, - OSError, - ): - pass - entity_name = parts[-1] - if ( - entity_name[0].isupper() + is_class_entity = ( + entity_name[:1].isupper() or entity_name.endswith(cs.JAVA_SUFFIX_EXCEPTION) or entity_name.endswith(cs.JAVA_SUFFIX_ERROR) or entity_name.endswith(cs.JAVA_SUFFIX_INTERFACE) or entity_name.endswith(cs.JAVA_SUFFIX_BUILDER) or entity_name in cs.JAVA_STDLIB_CLASSES - ): - return cs.SEPARATOR_DOT.join(parts[:-1]) + ) + if full_qualified_name.startswith(cs.JAVA_STDLIB_PREFIXES): + result = ( + 
cs.SEPARATOR_DOT.join(parts[:-1]) + if is_class_entity + else full_qualified_name + ) + _cache_stdlib_result( + cs.SupportedLanguage.JAVA, full_qualified_name, result + ) + return result + + if is_class_entity: + result = cs.SEPARATOR_DOT.join(parts[:-1]) + _cache_stdlib_result( + cs.SupportedLanguage.JAVA, full_qualified_name, result + ) + return result + + _cache_stdlib_result( + cs.SupportedLanguage.JAVA, full_qualified_name, full_qualified_name + ) return full_qualified_name def _extract_lua_stdlib_path(self, full_qualified_name: str) -> str: @@ -750,7 +661,7 @@ def _extract_lua_stdlib_path(self, full_qualified_name: str) -> str: pass entity_name = parts[-1] - if entity_name[0].isupper() or entity_name in cs.LUA_STDLIB_MODULES: + if entity_name[:1].isupper() or entity_name in cs.LUA_STDLIB_MODULES: return cs.SEPARATOR_DOT.join(parts[:-1]) return full_qualified_name @@ -759,7 +670,7 @@ def _extract_generic_stdlib_path(self, full_qualified_name: str) -> str: parts = full_qualified_name.split(cs.SEPARATOR_DOT) if len(parts) >= 2: entity_name = parts[-1] - if entity_name[0].isupper(): + if entity_name[:1].isupper(): return cs.SEPARATOR_DOT.join(parts[:-1]) return full_qualified_name diff --git a/codebase_rag/parsers/structure_processor.py b/codebase_rag/parsers/structure_processor.py index 9b4065bd3..f10165769 100644 --- a/codebase_rag/parsers/structure_processor.py +++ b/codebase_rag/parsers/structure_processor.py @@ -10,6 +10,16 @@ class StructureProcessor: + __slots__ = ( + "ingestor", + "repo_path", + "project_name", + "queries", + "structural_elements", + "unignore_paths", + "exclude_paths", + ) + def __init__( self, ingestor: IngestorProtocol, @@ -79,6 +89,7 @@ def identify_structure(self) -> None: cs.KEY_QUALIFIED_NAME: package_qn, cs.KEY_NAME: root.name, cs.KEY_PATH: relative_root.as_posix(), + cs.KEY_ABSOLUTE_PATH: root.resolve().as_posix(), }, ) parent_identifier = self._get_parent_identifier( @@ -96,7 +107,11 @@ def identify_structure(self) -> None: ) self.ingestor.ensure_node_batch( cs.NodeLabel.FOLDER, - {cs.KEY_PATH: relative_root.as_posix(), cs.KEY_NAME: root.name}, + { + cs.KEY_PATH: relative_root.as_posix(), + cs.KEY_NAME: root.name, + cs.KEY_ABSOLUTE_PATH: root.resolve().as_posix(), + }, ) parent_identifier = self._get_parent_identifier( parent_rel_path, parent_container_qn @@ -122,6 +137,7 @@ def process_generic_file(self, file_path: Path, file_name: str) -> None: cs.KEY_PATH: relative_filepath, cs.KEY_NAME: file_name, cs.KEY_EXTENSION: file_path.suffix, + cs.KEY_ABSOLUTE_PATH: file_path.resolve().as_posix(), }, ) diff --git a/codebase_rag/parsers/type_inference.py b/codebase_rag/parsers/type_inference.py index 815e4af81..d0aee7164 100644 --- a/codebase_rag/parsers/type_inference.py +++ b/codebase_rag/parsers/type_inference.py @@ -19,6 +19,22 @@ class TypeInferenceEngine: + __slots__ = ( + "import_processor", + "function_registry", + "repo_path", + "project_name", + "ast_cache", + "queries", + "module_qn_to_file_path", + "class_inheritance", + "simple_name_lookup", + "_java_type_inference", + "_lua_type_inference", + "_js_type_inference", + "_python_type_inference", + ) + def __init__( self, import_processor: ImportProcessor, diff --git a/codebase_rag/parsers/utils.py b/codebase_rag/parsers/utils.py index b164a5022..82f9234f6 100644 --- a/codebase_rag/parsers/utils.py +++ b/codebase_rag/parsers/utils.py @@ -2,6 +2,7 @@ from collections.abc import Callable from functools import lru_cache +from pathlib import Path from typing import TYPE_CHECKING, NamedTuple from loguru 
import logger @@ -29,6 +30,15 @@ class FunctionCapturesResult(NamedTuple): captures: dict[str, list[ASTNode]] +def sorted_captures(cursor: QueryCursor, node: ASTNode) -> dict[str, list[ASTNode]]: + # (H) tree-sitter v0.25 captures() returns nodes in non-deterministic order + # across process invocations; sort by start_byte for reproducibility + raw = cursor.captures(node) + return { + name: sorted(nodes, key=lambda n: n.start_byte) for name, nodes in raw.items() + } + + def get_function_captures( root_node: ASTNode, language: cs.SupportedLanguage, @@ -41,11 +51,11 @@ def get_function_captures( return None cursor = QueryCursor(query) - captures = cursor.captures(root_node) + captures = sorted_captures(cursor, root_node) return FunctionCapturesResult(lang_config, captures) -@lru_cache(maxsize=10000) +@lru_cache(maxsize=50000) def _cached_decode_bytes(text_bytes: bytes) -> str: return text_bytes.decode(cs.ENCODING_UTF8) @@ -83,6 +93,8 @@ def ingest_method( language: cs.SupportedLanguage | None = None, extract_decorators_func: Callable[[ASTNode], list[str]] | None = None, method_qualified_name: str | None = None, + file_path: Path | None = None, + repo_path: Path | None = None, ) -> None: if language == cs.SupportedLanguage.CPP: from .cpp import utils as cpp_utils @@ -109,6 +121,9 @@ def ingest_method( cs.KEY_END_LINE: method_node.end_point[0] + 1, cs.KEY_DOCSTRING: get_docstring_func(method_node), } + if file_path is not None and repo_path is not None: + method_props[cs.KEY_PATH] = file_path.relative_to(repo_path).as_posix() + method_props[cs.KEY_ABSOLUTE_PATH] = file_path.resolve().as_posix() logger.info(logs.METHOD_FOUND.format(name=method_name, qn=method_qn)) ingestor.ensure_node_batch(cs.NodeLabel.METHOD, method_props) @@ -162,6 +177,11 @@ def is_method_node(func_node: ASTNode, lang_config: LanguageSpec) -> bool: return False while current and current.type not in lang_config.module_node_types: + if ( + current.type in lang_config.function_node_types + and current.child_by_field_name(cs.FIELD_BODY) is not None + ): + return False if current.type in lang_config.class_node_types: return True current = current.parent diff --git a/codebase_rag/prompts.py b/codebase_rag/prompts.py index de5cce132..48bbe8d4b 100644 --- a/codebase_rag/prompts.py +++ b/codebase_rag/prompts.py @@ -196,6 +196,14 @@ def build_rag_orchestrator_prompt(tools: list["Tool"]) -> str: - CORRECT: `MATCH (c:Class) RETURN count(c) AS total` - WRONG: `MATCH (c:Class) RETURN c.name, count(c) AS total` (returns all items!) +**VALUE PATTERN RULES (CRITICAL FOR NAME MATCHING):** +- The `qualified_name` property contains FULL paths like: `'Project.folder.subfolder.ClassName'` +- When users mention a class or function by SHORT NAME (e.g., "VatManager", "UserService"), you MUST match using the `name` property, NOT `qualified_name`. +- CORRECT: `WHERE c.name = 'VatManager'` +- WRONG: `WHERE c.qualified_name = 'VatManager'` (will never match!) +- Use `DEFINES_METHOD` relationship to find methods of a class. +- Use `DEFINES` relationship to find functions/classes defined in a module. + **Examples:** * **Natural Language:** "How many classes are there?" @@ -235,7 +243,7 @@ def build_rag_orchestrator_prompt(tools: list["Tool"]) -> str: ``` * **Natural Language:** "What methods does UserService have?" 
or "Show me methods in UserService" or "List UserService methods" -* **Cypher Query (Use ENDS WITH to match class by short name):** +* **Cypher Query (Note: match by `name` property, use `DEFINES_METHOD` relationship):** ```cypher {CYPHER_EXAMPLE_CLASS_METHODS} ``` diff --git a/codebase_rag/providers/base.py b/codebase_rag/providers/base.py index 37f5cb462..f7dbb55f6 100644 --- a/codebase_rag/providers/base.py +++ b/codebase_rag/providers/base.py @@ -6,8 +6,13 @@ import httpx from loguru import logger +from pydantic_ai.models.anthropic import AnthropicModel from pydantic_ai.models.google import GoogleModel, GoogleModelSettings from pydantic_ai.models.openai import OpenAIChatModel, OpenAIResponsesModel +from pydantic_ai.providers.anthropic import ( + AnthropicProvider as PydanticAnthropicProvider, +) +from pydantic_ai.providers.azure import AzureProvider as PydanticAzureProvider from pydantic_ai.providers.google import GoogleProvider as PydanticGoogleProvider from pydantic_ai.providers.openai import OpenAIProvider as PydanticOpenAIProvider @@ -18,13 +23,15 @@ class ModelProvider(ABC): + __slots__ = ("config",) + def __init__(self, **config: str | int | None) -> None: self.config = config @abstractmethod def create_model( self, model_id: str, **kwargs: str | int | None - ) -> GoogleModel | OpenAIResponsesModel | OpenAIChatModel: + ) -> GoogleModel | OpenAIResponsesModel | OpenAIChatModel | AnthropicModel: pass @abstractmethod @@ -37,7 +44,25 @@ def provider_name(self) -> cs.Provider: pass +def _resolve_api_key(api_key: str | None, env_var: str) -> str | None: + env_key = os.environ.get(env_var) + if env_key: + return env_key + if api_key and api_key != cs.DEFAULT_API_KEY: + return api_key + return None + + class GoogleProvider(ModelProvider): + __slots__ = ( + "api_key", + "provider_type", + "project_id", + "region", + "service_account_file", + "thinking_budget", + ) + def __init__( self, api_key: str | None = None, @@ -49,7 +74,7 @@ def __init__( **kwargs: str | int | None, ) -> None: super().__init__(**kwargs) - self.api_key = api_key or os.environ.get(cs.ENV_GOOGLE_API_KEY) + self.api_key = _resolve_api_key(api_key, cs.ENV_GOOGLE_API_KEY) self.provider_type = provider_type self.project_id = project_id self.region = region @@ -98,6 +123,8 @@ def create_model(self, model_id: str, **kwargs: str | int | None) -> GoogleModel class OpenAIProvider(ModelProvider): + __slots__ = ("api_key", "endpoint") + def __init__( self, api_key: str | None = None, @@ -105,7 +132,7 @@ def __init__( **kwargs: str | int | None, ) -> None: super().__init__(**kwargs) - self.api_key = api_key or os.environ.get(cs.ENV_OPENAI_API_KEY) + self.api_key = _resolve_api_key(api_key, cs.ENV_OPENAI_API_KEY) self.endpoint = endpoint @property @@ -126,6 +153,8 @@ def create_model( class OllamaProvider(ModelProvider): + __slots__ = ("endpoint", "api_key") + def __init__( self, endpoint: str | None = None, @@ -155,12 +184,91 @@ def create_model( return OpenAIChatModel(model_id, provider=provider) +class AnthropicProvider(ModelProvider): + __slots__ = ("api_key",) + + def __init__( + self, + api_key: str | None = None, + **kwargs: str | int | None, + ) -> None: + super().__init__(**kwargs) + self.api_key = _resolve_api_key(api_key, cs.ENV_ANTHROPIC_API_KEY) + + @property + def provider_name(self) -> cs.Provider: + return cs.Provider.ANTHROPIC + + def validate_config(self) -> None: + if not self.api_key: + raise ValueError(ex.ANTHROPIC_NO_KEY) + + def create_model(self, model_id: str, **kwargs: str | int | None) -> AnthropicModel: 
+        self.validate_config()
+        # (H) api_key is guaranteed to be set by validate_config
+        assert self.api_key is not None
+        provider = PydanticAnthropicProvider(api_key=self.api_key)
+        return AnthropicModel(model_id, provider=provider)
+
+
+class AzureOpenAIProvider(ModelProvider):
+    __slots__ = ("api_key", "endpoint", "api_version")
+
+    def __init__(
+        self,
+        api_key: str | None = None,
+        endpoint: str | None = None,
+        api_version: str | None = None,
+        **kwargs: str | int | None,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.api_key = _resolve_api_key(api_key, cs.ENV_AZURE_API_KEY)
+        self.endpoint = endpoint or os.environ.get(cs.ENV_AZURE_ENDPOINT)
+        self.api_version = api_version or os.environ.get(cs.ENV_AZURE_API_VERSION)
+
+    @property
+    def provider_name(self) -> cs.Provider:
+        return cs.Provider.AZURE
+
+    def validate_config(self) -> None:
+        if not self.api_key:
+            raise ValueError(ex.AZURE_NO_KEY)
+        if not self.endpoint:
+            raise ValueError(ex.AZURE_NO_ENDPOINT)
+
+    def create_model(
+        self, model_id: str, **kwargs: str | int | None
+    ) -> OpenAIChatModel:
+        self.validate_config()
+        # (H) api_key and endpoint are guaranteed to be set by validate_config
+        assert self.api_key is not None
+        assert self.endpoint is not None
+        provider = PydanticAzureProvider(
+            api_key=self.api_key,
+            azure_endpoint=self.endpoint,
+            api_version=self.api_version,
+        )
+        return OpenAIChatModel(model_id, provider=provider)
+
+
 PROVIDER_REGISTRY: dict[str, type[ModelProvider]] = {
     cs.Provider.GOOGLE: GoogleProvider,
     cs.Provider.OPENAI: OpenAIProvider,
     cs.Provider.OLLAMA: OllamaProvider,
+    cs.Provider.ANTHROPIC: AnthropicProvider,
+    cs.Provider.AZURE: AzureOpenAIProvider,
 }
 
+# Import LiteLLM provider after base classes are defined to avoid circular import
+try:
+    from .litellm import LiteLLMProvider
+
+    PROVIDER_REGISTRY[cs.Provider.LITELLM_PROXY] = LiteLLMProvider
+    _litellm_available = True
+except ImportError as e:
+    logger.debug(f"LiteLLM provider not available: {e}")
+    _litellm_available = False
+
 
 def get_provider(
     provider_name: str | cs.Provider, **config: str | int | None
@@ -207,3 +315,29 @@ def check_ollama_running(endpoint: str | None = None) -> bool:
         return response.status_code == cs.HTTP_OK
     except (httpx.RequestError, httpx.TimeoutException):
         return False
+
+
+def check_litellm_proxy_running(
+    endpoint: str = "http://localhost:4000", api_key: str | None = None
+) -> bool:
+    try:
+        # (H) removesuffix, not rstrip: str.rstrip("/v1") strips any trailing
+        # (H) run of the characters '/', 'v', '1' and can mangle the port number
+        base_url = endpoint.rstrip("/").removesuffix("/v1")
+        health_url = urljoin(base_url, "/health")
+        headers: dict[str, str] = {}
+        if api_key:
+            headers["Authorization"] = f"Bearer {api_key}"
+
+        with httpx.Client(timeout=settings.LITELLM_HEALTH_TIMEOUT) as client:
+            response = client.get(health_url, headers=headers)
+            if response.status_code == cs.HTTP_OK:
+                return True
+
+            # (H) Fallback to models endpoint for authenticated proxies
+            if api_key:
+                models_url = urljoin(base_url, "/v1/models")
+                response = client.get(models_url, headers=headers)
+                return response.status_code == cs.HTTP_OK
+
+            return False
+    except (httpx.RequestError, httpx.TimeoutException):
+        return False
diff --git a/codebase_rag/providers/litellm.py b/codebase_rag/providers/litellm.py
new file mode 100644
index 000000000..7fc0360c3
--- /dev/null
+++ b/codebase_rag/providers/litellm.py
@@ -0,0 +1,50 @@
+"""LiteLLM provider using pydantic-ai's native LiteLLMProvider."""
+
+from __future__ import annotations
+
+from loguru import logger
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.litellm import LiteLLMProvider as PydanticLiteLLMProvider
+
+from codebase_rag import constants as cs
+from codebase_rag import exceptions as ex
+
+from .base import ModelProvider
+
+
+class LiteLLMProvider(ModelProvider):
+    __slots__ = ("api_key", "endpoint")
+
+    def __init__(
+        self,
+        api_key: str | None = None,
+        endpoint: str = "http://localhost:4000/v1",
+        **kwargs: str | int | None,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.api_key = api_key
+        self.endpoint = endpoint
+
+    @property
+    def provider_name(self) -> cs.Provider:
+        return cs.Provider.LITELLM_PROXY
+
+    def validate_config(self) -> None:
+        if not self.endpoint:
+            raise ValueError(ex.LITELLM_NO_ENDPOINT)
+
+        from .base import check_litellm_proxy_running
+
+        # (H) removesuffix, not rstrip: str.rstrip("/v1") strips a trailing
+        # (H) character set, not the literal suffix, and can mangle the port
+        base_url = self.endpoint.rstrip("/").removesuffix("/v1")
+        if not check_litellm_proxy_running(base_url, api_key=self.api_key):
+            raise ValueError(ex.LITELLM_NOT_RUNNING.format(endpoint=base_url))
+
+    def create_model(
+        self, model_id: str, **kwargs: str | int | None
+    ) -> OpenAIChatModel:
+        self.validate_config()
+
+        logger.info(f"Creating LiteLLM proxy model: {model_id} at {self.endpoint}")
+
+        provider = PydanticLiteLLMProvider(api_key=self.api_key, api_base=self.endpoint)
+        return OpenAIChatModel(model_id, provider=provider)
diff --git a/codebase_rag/services/graph_service.py b/codebase_rag/services/graph_service.py
index 7a8d95e02..342eeae1b 100644
--- a/codebase_rag/services/graph_service.py
+++ b/codebase_rag/services/graph_service.py
@@ -1,14 +1,17 @@
 from __future__ import annotations
 
+import threading
 import types
 from collections import defaultdict
 from collections.abc import Generator, Sequence
-from contextlib import contextmanager
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from contextlib import contextmanager, nullcontext
 from datetime import UTC, datetime
 
 import mgclient  # ty: ignore[unresolved-import]
 from loguru import logger
 
+from codebase_rag.config import settings
 from codebase_rag.types_defs import CursorProtocol, ResultValue
 
 from ..
import exceptions as ex @@ -32,6 +35,8 @@ CYPHER_EXPORT_RELATIONSHIPS, CYPHER_LIST_PROJECTS, build_constraint_query, + build_create_node_query, + build_create_relationship_query, build_index_query, build_merge_node_query, build_merge_relationship_query, @@ -51,27 +56,53 @@ class MemgraphIngestor: - def __init__(self, host: str, port: int, batch_size: int = 1000): + __slots__ = ( + "_conn_lock", + "_executor", + "_host", + "_port", + "_username", + "_password", + "_use_merge", + "_rel_count", + "_rel_groups", + "batch_size", + "conn", + "node_buffer", + ) + + def __init__( + self, + host: str, + port: int, + batch_size: int = 1000, + username: str | None = None, + password: str | None = None, + use_merge: bool = True, + ): self._host = host self._port = port + self._username = username.strip() if username and username.strip() else None + self._password = password.strip() if password and password.strip() else None + if (self._username is None) != (self._password is None): + raise ValueError(ex.AUTH_INCOMPLETE) if batch_size < 1: raise ValueError(ex.BATCH_SIZE) self.batch_size = batch_size + self._use_merge = use_merge + self._conn_lock = threading.Lock() + self._executor: ThreadPoolExecutor | None = None self.conn: mgclient.Connection | None = None self.node_buffer: list[tuple[str, dict[str, PropertyValue]]] = [] - self.relationship_buffer: list[ - tuple[ - tuple[str, str, PropertyValue], - str, - tuple[str, str, PropertyValue], - dict[str, PropertyValue] | None, - ] - ] = [] + self._rel_count = 0 + self._rel_groups: defaultdict[ + tuple[str, str, str, str, str], list[RelBatchRow] + ] = defaultdict(list) def __enter__(self) -> MemgraphIngestor: logger.info(ls.MG_CONNECTING.format(host=self._host, port=self._port)) - self.conn = mgclient.connect(host=self._host, port=self._port) - self.conn.autocommit = True + self.conn = self._create_connection() + self._executor = ThreadPoolExecutor(max_workers=settings.FLUSH_THREAD_POOL_SIZE) logger.info(ls.MG_CONNECTED) return self @@ -81,24 +112,49 @@ def __exit__( exc_val: Exception | None, exc_tb: types.TracebackType | None, ) -> None: - if exc_type: - logger.exception(ls.MG_EXCEPTION.format(error=exc_val)) - self.flush_all() - if self.conn: - self.conn.close() - logger.info(ls.MG_DISCONNECTED) + try: + if exc_type: + logger.exception(ls.MG_EXCEPTION.format(error=exc_val)) + # (H) Best-effort flush: attempt to persist buffered nodes/relationships + # (H) even when an exception occurred. Catching broad Exception so a + # (H) secondary flush failure never masks the original exception. 
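+                # (H) Ordering assumption: this flush must complete before the
+                # (H) executor shutdown in the finally block below, because
+                # (H) flush_nodes()/flush_relationships() submit their label
+                # (H) and pattern groups to that executor.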
+ try: + self.flush_all() + except Exception as flush_err: + logger.error(ls.MG_FLUSH_ERROR.format(error=flush_err)) + else: + self.flush_all() + finally: + if self._executor: + self._executor.shutdown(wait=True) + self._executor = None + if self.conn: + self.conn.close() + logger.info(ls.MG_DISCONNECTED) + + async def __aenter__(self) -> MemgraphIngestor: + return self.__enter__() + + async def __aexit__( + self, + exc_type: type | None, + exc_val: Exception | None, + exc_tb: types.TracebackType | None, + ) -> None: + self.__exit__(exc_type, exc_val, exc_tb) @contextmanager def _get_cursor(self) -> Generator[CursorProtocol, None, None]: if not self.conn: raise ConnectionError(ex.CONN) - cursor: CursorProtocol | None = None - try: - cursor = self.conn.cursor() - yield cursor - finally: - if cursor: - cursor.close() + with self._conn_lock: + cursor: CursorProtocol | None = None + try: + cursor = self.conn.cursor() + yield cursor + finally: + if cursor: + cursor.close() def _cursor_to_results(self, cursor: CursorProtocol) -> list[ResultRow]: if not cursor.description: @@ -128,12 +184,30 @@ def _execute_query( logger.error(ls.MG_CYPHER_PARAMS.format(params=params)) raise - def _execute_batch(self, query: str, params_list: Sequence[BatchParams]) -> None: - if not self.conn or not params_list: + def _create_connection(self) -> mgclient.Connection: + if self._username is not None: + conn = mgclient.connect( + host=self._host, + port=self._port, + username=self._username, + password=self._password, + ) + else: + conn = mgclient.connect(host=self._host, port=self._port) + conn.autocommit = True + return conn + + def _execute_batch_on( + self, + conn: mgclient.Connection, + query: str, + params_list: Sequence[BatchParams], + ) -> None: + if not params_list: return cursor = None try: - cursor = self.conn.cursor() + cursor = conn.cursor() cursor.execute(wrap_with_unwind(query), BatchWrapper(batch=params_list)) except Exception as e: if ERR_SUBSTR_ALREADY_EXISTS not in str(e).lower(): @@ -152,14 +226,17 @@ def _execute_batch(self, query: str, params_list: Sequence[BatchParams]) -> None if cursor: cursor.close() - def _execute_batch_with_return( - self, query: str, params_list: Sequence[BatchParams] + def _execute_batch_with_return_on( + self, + conn: mgclient.Connection, + query: str, + params_list: Sequence[BatchParams], ) -> list[ResultRow]: - if not self.conn or not params_list: + if not params_list: return [] cursor = None try: - cursor = self.conn.cursor() + cursor = conn.cursor() cursor.execute(wrap_with_unwind(query), BatchWrapper(batch=params_list)) return self._cursor_to_results(cursor) except Exception as e: @@ -208,7 +285,7 @@ def ensure_node_batch( ) -> None: self.node_buffer.append((label, properties)) if len(self.node_buffer) >= self.batch_size: - logger.debug(ls.MG_NODE_BUFFER_FLUSH.format(size=self.batch_size)) + logger.debug(ls.MG_NODE_BUFFER_FLUSH, size=self.batch_size) self.flush_nodes() def ensure_relationship_batch( @@ -220,19 +297,82 @@ def ensure_relationship_batch( ) -> None: from_label, from_key, from_val = from_spec to_label, to_key, to_val = to_spec - self.relationship_buffer.append( - ( - (from_label, from_key, from_val), - rel_type, - (to_label, to_key, to_val), - properties, - ) + pattern = (from_label, from_key, rel_type, to_label, to_key) + self._rel_groups[pattern].append( + RelBatchRow(from_val=from_val, to_val=to_val, props=properties or {}) ) - if len(self.relationship_buffer) >= self.batch_size: - logger.debug(ls.MG_REL_BUFFER_FLUSH.format(size=self.batch_size)) + 
self._rel_count += 1 + if self._rel_count >= self.batch_size: + logger.debug(ls.MG_REL_BUFFER_FLUSH, size=self.batch_size) self.flush_nodes() self.flush_relationships() + def _flush_node_label_group( + self, + label: str, + props_list: list[dict[str, PropertyValue]], + conn: mgclient.Connection | None = None, + ) -> tuple[int, int]: + if not props_list: + return 0, 0 + + id_key = NODE_UNIQUE_CONSTRAINTS.get(label) + if not id_key: + logger.warning(ls.MG_NO_CONSTRAINT.format(label=label)) + return 0, len(props_list) + + batch_rows: list[NodeBatchRow] = [] + skipped = 0 + for props in props_list: + if id_key not in props: + logger.warning( + ls.MG_MISSING_PROP.format( + label=label, key=id_key, prop_keys=list(props.keys()) + ) + ) + skipped += 1 + continue + row_props: PropertyDict = {k: v for k, v in props.items() if k != id_key} + batch_rows.append(NodeBatchRow(id=props[id_key], props=row_props)) + + if not batch_rows: + return 0, skipped + + build_query = ( + build_merge_node_query if self._use_merge else build_create_node_query + ) + query = build_query(label, id_key) + target_conn = conn or self.conn + if not target_conn: + logger.warning(ls.MG_NO_CONN_NODES.format(label=label)) + return 0, skipped + len(batch_rows) + lock = self._conn_lock if conn is None else nullcontext() + with lock: + self._execute_batch_on(target_conn, query, batch_rows) + return len(batch_rows), skipped + + def _flush_node_group_with_own_conn( + self, + label: str, + props_list: list[dict[str, PropertyValue]], + ) -> tuple[int, int]: + conn = self._create_connection() + try: + return self._flush_node_label_group(label, props_list, conn=conn) + finally: + conn.close() + + def _flush_rel_group_with_own_conn( + self, + pattern: tuple[str, str, str, str, str], + params_list: list[RelBatchRow], + ) -> tuple[int, int]: + conn = self._create_connection() + try: + return self._flush_rel_pattern_group(pattern, params_list, conn=conn) + finally: + conn.close() + def flush_nodes(self) -> None: if not self.node_buffer: return @@ -243,37 +383,46 @@ def flush_nodes(self) -> None: ) for label, props in self.node_buffer: nodes_by_label[label].append(props) + flushed_total = 0 skipped_total = 0 - for label, props_list in nodes_by_label.items(): - if not props_list: - continue - id_key = NODE_UNIQUE_CONSTRAINTS.get(label) - if not id_key: - logger.warning(ls.MG_NO_CONSTRAINT.format(label=label)) - skipped_total += len(props_list) - continue - batch_rows: list[NodeBatchRow] = [] - for props in props_list: - if id_key not in props: - logger.warning( - ls.MG_MISSING_PROP.format(label=label, key=id_key, props=props) - ) - skipped_total += 1 - continue - row_props: PropertyDict = { - k: v for k, v in props.items() if k != id_key - } - batch_rows.append(NodeBatchRow(id=props[id_key], props=row_props)) - - if not batch_rows: - continue + first_error: Exception | None = None - flushed_total += len(batch_rows) + if self._executor and len(nodes_by_label) > 1: + logger.info( + ls.MG_PARALLEL_FLUSH_NODES.format( + count=len(nodes_by_label), + workers=settings.FLUSH_THREAD_POOL_SIZE, + ) + ) + futures = { + self._executor.submit( + self._flush_node_group_with_own_conn, label, props_list + ): label + for label, props_list in nodes_by_label.items() + } + for future in as_completed(futures): + label = futures[future] + try: + flushed, skipped = future.result() + flushed_total += flushed + skipped_total += skipped + except Exception as e: + logger.error(ls.MG_LABEL_FLUSH_ERROR.format(label=label, error=e)) + if first_error is None: + first_error = 
e + else: + for label, props_list in nodes_by_label.items(): + try: + flushed, skipped = self._flush_node_label_group(label, props_list) + flushed_total += flushed + skipped_total += skipped + except Exception as e: + logger.error(ls.MG_LABEL_FLUSH_ERROR.format(label=label, error=e)) + if first_error is None: + first_error = e - query = build_merge_node_query(label, id_key) - self._execute_batch(query, batch_rows) logger.info( ls.MG_NODES_FLUSHED.format(flushed=flushed_total, total=buffer_size) ) @@ -281,61 +430,114 @@ def flush_nodes(self) -> None: logger.info(ls.MG_NODES_SKIPPED.format(count=skipped_total)) self.node_buffer.clear() - def flush_relationships(self) -> None: - if not self.relationship_buffer: - return + if first_error is not None: + raise first_error - rels_by_pattern: defaultdict[ - tuple[str, str, str, str, str], list[RelBatchRow] - ] = defaultdict(list) - for from_node, rel_type, to_node, props in self.relationship_buffer: - pattern = (from_node[0], from_node[1], rel_type, to_node[0], to_node[1]) - rels_by_pattern[pattern].append( - RelBatchRow(from_val=from_node[2], to_val=to_node[2], props=props or {}) + def _flush_rel_pattern_group( + self, + pattern: tuple[str, str, str, str, str], + params_list: list[RelBatchRow], + conn: mgclient.Connection | None = None, + ) -> tuple[int, int]: + from_label, from_key, rel_type, to_label, to_key = pattern + build_rel_query = ( + build_merge_relationship_query + if self._use_merge + else build_create_relationship_query + ) + has_props = any(p[KEY_PROPS] for p in params_list) + query = build_rel_query( + from_label, from_key, rel_type, to_label, to_key, has_props + ) + + target_conn = conn or self.conn + if not target_conn: + logger.warning(ls.MG_NO_CONN_RELS.format(pattern=pattern)) + return len(params_list), 0 + lock = self._conn_lock if conn is None else nullcontext() + with lock: + results = self._execute_batch_with_return_on( + target_conn, query, params_list ) + batch_successful = 0 + for r in results: + created = r.get(KEY_CREATED, 0) + if isinstance(created, int): + batch_successful += created + + if rel_type == REL_TYPE_CALLS: + failed = len(params_list) - batch_successful + if failed > 0: + logger.warning(ls.MG_CALLS_FAILED.format(count=failed)) + for i, sample in enumerate(params_list[:3]): + logger.warning( + ls.MG_CALLS_SAMPLE.format( + index=i + 1, + from_label=from_label, + from_val=sample[KEY_FROM_VAL], + to_label=to_label, + to_val=sample[KEY_TO_VAL], + ) + ) + + return len(params_list), batch_successful + + def flush_relationships(self) -> None: + if not self._rel_count: + return total_attempted = 0 total_successful = 0 - - for pattern, params_list in rels_by_pattern.items(): - from_label, from_key, rel_type, to_label, to_key = pattern - has_props = any(p[KEY_PROPS] for p in params_list) - query = build_merge_relationship_query( - from_label, from_key, rel_type, to_label, to_key, has_props + first_error: Exception | None = None + + if self._executor and len(self._rel_groups) > 1: + logger.info( + ls.MG_PARALLEL_FLUSH_RELS.format( + count=len(self._rel_groups), + workers=settings.FLUSH_THREAD_POOL_SIZE, + ) ) - - total_attempted += len(params_list) - results = self._execute_batch_with_return(query, params_list) - batch_successful = 0 - for r in results: - created = r.get(KEY_CREATED, 0) - if isinstance(created, int): - batch_successful += created - total_successful += batch_successful - - if rel_type == REL_TYPE_CALLS: - failed = len(params_list) - batch_successful - if failed > 0: - 
logger.warning(ls.MG_CALLS_FAILED.format(count=failed)) - for i, sample in enumerate(params_list[:3]): - logger.warning( - ls.MG_CALLS_SAMPLE.format( - index=i + 1, - from_label=from_label, - from_val=sample[KEY_FROM_VAL], - to_label=to_label, - to_val=sample[KEY_TO_VAL], - ) - ) + futures = { + self._executor.submit( + self._flush_rel_group_with_own_conn, pattern, params_list + ): pattern + for pattern, params_list in self._rel_groups.items() + } + for future in as_completed(futures): + pattern = futures[future] + try: + attempted, successful = future.result() + total_attempted += attempted + total_successful += successful + except Exception as e: + logger.error(ls.MG_REL_FLUSH_ERROR.format(pattern=pattern, error=e)) + if first_error is None: + first_error = e + else: + for pattern, params_list in self._rel_groups.items(): + try: + attempted, successful = self._flush_rel_pattern_group( + pattern, params_list + ) + total_attempted += attempted + total_successful += successful + except Exception as e: + logger.error(ls.MG_REL_FLUSH_ERROR.format(pattern=pattern, error=e)) + if first_error is None: + first_error = e logger.info( ls.MG_RELS_FLUSHED.format( - total=len(self.relationship_buffer), + total=self._rel_count, success=total_successful, failed=total_attempted - total_successful, ) ) - self.relationship_buffer.clear() + self._rel_count = 0 + self._rel_groups.clear() + + if first_error is not None: + raise first_error def flush_all(self) -> None: logger.info(ls.MG_FLUSH_START) @@ -346,13 +548,13 @@ def flush_all(self) -> None: def fetch_all( self, query: str, params: dict[str, PropertyValue] | None = None ) -> list[ResultRow]: - logger.debug(ls.MG_FETCH_QUERY.format(query=query, params=params)) + logger.debug(ls.MG_FETCH_QUERY, query=query, params=params) return self._execute_query(query, params) def execute_write( self, query: str, params: dict[str, PropertyValue] | None = None ) -> None: - logger.debug(ls.MG_WRITE_QUERY.format(query=query, params=params)) + logger.debug(ls.MG_WRITE_QUERY, query=query, params=params) self._execute_query(query, params) def export_graph_to_dict(self) -> GraphData: diff --git a/codebase_rag/services/llm.py b/codebase_rag/services/llm.py index 018ccc1af..916437737 100644 --- a/codebase_rag/services/llm.py +++ b/codebase_rag/services/llm.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from typing import TYPE_CHECKING from loguru import logger @@ -26,15 +27,73 @@ def _create_provider_model(config: ModelConfig) -> Model: def _clean_cypher_response(response_text: str) -> str: - query = response_text.strip().replace(cs.CYPHER_BACKTICK, "") - if query.startswith(cs.CYPHER_PREFIX): - query = query[len(cs.CYPHER_PREFIX) :].strip() + """Clean LLM response to extract pure Cypher query. + + Handles markdown formatting that models sometimes output: + - Triple backticks (```cypher ... ```) + - Bold text (**Cypher Query:**) + - Headers and other markdown + """ + query = response_text.strip() + + # Extract content from code blocks (```cypher ... ``` or ``` ... 
```) + if "```" in query: + parts = query.split("```") + if len(parts) >= 3: + block = parts[1] + if block.lower().startswith("cypher"): + block = block[len("cypher") :] + query = block.strip() + else: + # Remove markdown bold/headers (e.g., **Cypher Query:**) + while "**" in query: + start = query.index("**") + end = query.find("**", start + 2) + if end == -1: + break + after = end + 2 + if after < len(query) and query[after] == ":": + after += 1 + query = query[:start] + query[after:].lstrip() + # Remove single backticks + query = query.replace(cs.CYPHER_BACKTICK, "") + # Remove "cypher" prefix if present + if query.lower().startswith(cs.CYPHER_PREFIX): + query = query[len(cs.CYPHER_PREFIX) :].strip() + if not query.endswith(cs.CYPHER_SEMICOLON): query += cs.CYPHER_SEMICOLON return query +_COMMENT_OR_WS = r"(?:\s|//[^\n]*|/\*.*?\*/)+" + + +def _build_keyword_pattern(keyword: str) -> re.Pattern[str]: + parts = keyword.split() + if len(parts) == 1: + return re.compile(rf"\b{re.escape(parts[0])}\b") + joined = _COMMENT_OR_WS.join(re.escape(p) for p in parts) + return re.compile(rf"\b{joined}\b", re.DOTALL) + + +_CYPHER_DANGEROUS_PATTERNS: list[tuple[str, re.Pattern[str]]] = [ + (kw, _build_keyword_pattern(kw)) for kw in cs.CYPHER_DANGEROUS_KEYWORDS +] + + +def _validate_cypher_read_only(query: str) -> None: + upper_query = query.upper() + for keyword, pattern in _CYPHER_DANGEROUS_PATTERNS: + if pattern.search(upper_query): + raise ex.LLMGenerationError( + ex.LLM_DANGEROUS_QUERY.format(keyword=keyword, query=query) + ) + + class CypherGenerator: + __slots__ = ("agent",) + def __init__(self) -> None: try: config = settings.active_cypher_config @@ -68,6 +127,7 @@ async def generate(self, natural_language_query: str) -> str: ) query = _clean_cypher_response(result.output) + _validate_cypher_read_only(query) logger.info(ls.CYPHER_GENERATED.format(query=query)) return query except Exception as e: diff --git a/codebase_rag/services/protobuf_service.py b/codebase_rag/services/protobuf_service.py index 7c5138c12..e129cafce 100644 --- a/codebase_rag/services/protobuf_service.py +++ b/codebase_rag/services/protobuf_service.py @@ -22,6 +22,10 @@ cs.NodeLabel.EXTERNAL_PACKAGE: cs.ONEOF_EXTERNAL_PACKAGE, cs.NodeLabel.MODULE_IMPLEMENTATION: cs.ONEOF_MODULE_IMPLEMENTATION, cs.NodeLabel.MODULE_INTERFACE: cs.ONEOF_MODULE_INTERFACE, + cs.NodeLabel.INTERFACE: cs.ONEOF_INTERFACE, + cs.NodeLabel.ENUM: cs.ONEOF_ENUM, + cs.NodeLabel.TYPE: cs.ONEOF_TYPE, + cs.NodeLabel.UNION: cs.ONEOF_UNION, } ONEOF_FIELD_TO_LABEL: dict[str, cs.NodeLabel] = { @@ -33,6 +37,8 @@ class ProtobufFileIngestor: + __slots__ = ("output_dir", "_nodes", "_relationships", "split_index") + def __init__(self, output_path: str, split_index: bool = False): self.output_dir = Path(output_path) self._nodes: dict[str, pb.Node] = {} diff --git a/codebase_rag/tests/conftest.py b/codebase_rag/tests/conftest.py index a22c1ede0..3ba1ec6dd 100644 --- a/codebase_rag/tests/conftest.py +++ b/codebase_rag/tests/conftest.py @@ -8,14 +8,13 @@ from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Protocol, Self -from unittest.mock import MagicMock +from unittest.mock import MagicMock, call import pytest from loguru import logger from codebase_rag.graph_updater import GraphUpdater from codebase_rag.parser_loader import load_parsers -from codebase_rag.services.graph_service import MemgraphIngestor if TYPE_CHECKING: pass # ty: ignore[unresolved-import] @@ -97,10 +96,44 @@ def temp_repo() -> Generator[Path, None, None]: 
shutil.rmtree(temp_dir) +class _MockIngestor: + _TRACKED = ( + "fetch_all", + "execute_write", + "ensure_node_batch", + "ensure_relationship_batch", + "flush_all", + ) + + def __init__(self) -> None: + self.fetch_all = MagicMock() + self.execute_write = MagicMock() + self.ensure_node_batch = MagicMock() + self.ensure_relationship_batch = MagicMock() + self.flush_all = MagicMock() + self._fallback = MagicMock() + + def reset_mock(self) -> None: + for name in (*self._TRACKED, "_fallback"): + getattr(self, name).reset_mock() + + @property + def method_calls(self) -> list: + result = [] + for name in self._TRACKED: + mock_attr = self.__dict__[name] + for c in mock_attr.call_args_list: + result.append(getattr(call, name)(*c.args, **c.kwargs)) + result.extend(self._fallback.method_calls) + return result + + def __getattr__(self, name: str) -> MagicMock: + return getattr(self._fallback, name) + + @pytest.fixture -def mock_ingestor() -> MagicMock: - """Provides a mocked MemgraphIngestor instance.""" - return MagicMock(spec=MemgraphIngestor) +def mock_ingestor() -> _MockIngestor: + return _MockIngestor() def run_updater( diff --git a/codebase_rag/tests/fuzz_test_parsers.py b/codebase_rag/tests/fuzz_test_parsers.py new file mode 100644 index 000000000..d9a608887 --- /dev/null +++ b/codebase_rag/tests/fuzz_test_parsers.py @@ -0,0 +1,20 @@ +import sys + +import atheris + +from codebase_rag.language_spec import ( + get_language_for_extension, + get_language_spec, +) + + +def fuzz_language_spec(data): + fdp = atheris.FuzzedDataProvider(data) + extension = fdp.ConsumeUnicodeNoSurrogates(64) + get_language_spec(extension) + get_language_for_extension(extension) + + +if __name__ == "__main__": + atheris.Setup(sys.argv, fuzz_language_spec) + atheris.Fuzz() diff --git a/codebase_rag/tests/integration/test_node_label_e2e.py b/codebase_rag/tests/integration/test_node_label_e2e.py index f61792588..769ed14ff 100644 --- a/codebase_rag/tests/integration/test_node_label_e2e.py +++ b/codebase_rag/tests/integration/test_node_label_e2e.py @@ -17,7 +17,6 @@ SKIP_GO = "Go is in development status" SKIP_SCALA = "Scala is in development status" SKIP_CSHARP = "C# is in development status" -SKIP_PHP = "PHP is in development status" PYTHON_CODE = """\ @@ -617,29 +616,29 @@ def test_rust_creates_function_nodes( func_names = {n["name"] for n in functions} assert "standalone_fn" in func_names - def test_rust_creates_class_nodes_for_enums( + def test_rust_creates_enum_nodes_for_enums( self, memgraph_ingestor: MemgraphIngestor, rust_project: Path ) -> None: index_project(memgraph_ingestor, rust_project) labels = get_node_labels(memgraph_ingestor) - assert NodeLabel.CLASS.value in labels + assert NodeLabel.ENUM.value in labels - classes = get_nodes_by_label(memgraph_ingestor, NodeLabel.CLASS.value) - class_names = {n["name"] for n in classes} - assert "Status" in class_names + enums = get_nodes_by_label(memgraph_ingestor, NodeLabel.ENUM.value) + enum_names = {n["name"] for n in enums} + assert "Status" in enum_names - def test_rust_creates_class_nodes_for_traits( + def test_rust_creates_interface_nodes_for_traits( self, memgraph_ingestor: MemgraphIngestor, rust_project: Path ) -> None: index_project(memgraph_ingestor, rust_project) labels = get_node_labels(memgraph_ingestor) - assert NodeLabel.CLASS.value in labels + assert NodeLabel.INTERFACE.value in labels - classes = get_nodes_by_label(memgraph_ingestor, NodeLabel.CLASS.value) - class_names = {n["name"] for n in classes} - assert "MyTrait" in class_names + interfaces = 
get_nodes_by_label(memgraph_ingestor, NodeLabel.INTERFACE.value) + interface_names = {n["name"] for n in interfaces} + assert "MyTrait" in interface_names @pytest.mark.skip(reason=SKIP_GO) @@ -864,7 +863,6 @@ def test_csharp_creates_enum_nodes( assert "Status" in enum_names -@pytest.mark.skip(reason=SKIP_PHP) class TestPhpNodeLabels: def test_php_creates_class_nodes( self, memgraph_ingestor: MemgraphIngestor, php_project: Path @@ -939,7 +937,7 @@ def test_lua_creates_function_nodes( ("java_project", None), ("cpp_project", None), ("csharp_project", SKIP_CSHARP), - ("php_project", SKIP_PHP), + ("php_project", None), ("lua_project", None), ] diff --git a/codebase_rag/tests/integration/test_shell_command_integration.py b/codebase_rag/tests/integration/test_shell_command_integration.py index c5fda3f68..47391b6c0 100644 --- a/codebase_rag/tests/integration/test_shell_command_integration.py +++ b/codebase_rag/tests/integration/test_shell_command_integration.py @@ -1,5 +1,6 @@ from __future__ import annotations +import shutil from pathlib import Path from unittest.mock import MagicMock @@ -11,6 +12,8 @@ create_shell_command_tool, ) +_HAS_RG = shutil.which("rg") is not None + pytestmark = [pytest.mark.anyio, pytest.mark.integration] @@ -112,6 +115,7 @@ async def test_rm_removes_file( assert result.return_code == 0 assert not (temp_test_repo / "file2.py").exists() + @pytest.mark.skipif(not _HAS_RG, reason="rg (ripgrep) not installed") async def test_rg_searches_content(self, shell_commander: ShellCommander) -> None: result = await shell_commander.execute("rg hello file2.py") assert "hello" in result.stdout or result.return_code == 0 @@ -199,6 +203,7 @@ async def test_ls_pipe_head(self, shell_commander: ShellCommander) -> None: lines = result.stdout.strip().split("\n") assert len(lines) <= 2 + @pytest.mark.skipif(not _HAS_RG, reason="rg (ripgrep) not installed") async def test_cat_pipe_rg( self, shell_commander: ShellCommander, temp_test_repo: Path ) -> None: @@ -217,6 +222,7 @@ async def test_echo_pipe_wc(self, shell_commander: ShellCommander) -> None: assert result.return_code == 0 assert "3" in result.stdout + @pytest.mark.skipif(not _HAS_RG, reason="rg (ripgrep) not installed") async def test_find_pipe_rg_pipe_wc(self, shell_commander: ShellCommander) -> None: result = await shell_commander.execute("find . 
-name '*.py' | rg py | wc -l") assert result.return_code == 0 diff --git a/codebase_rag/tests/test_absolute_path.py b/codebase_rag/tests/test_absolute_path.py new file mode 100644 index 000000000..ede90839e --- /dev/null +++ b/codebase_rag/tests/test_absolute_path.py @@ -0,0 +1,317 @@ +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parser_loader import load_parsers +from codebase_rag.tests.conftest import get_nodes, run_updater + +TS_CODE = ( + "interface Greeter {\n" + " greet(): string;\n" + "}\n\n" + "enum Direction {\n" + " Up = 'UP',\n" + " Down = 'DOWN',\n" + "}\n\n" + "class MyGreeter implements Greeter {\n" + " greet(): string { return 'hi'; }\n" + "}\n" +) + +CPP_MODULE_INTERFACE = "export module mymod;\nexport int add(int a, int b);\n" + +CPP_MODULE_IMPL = "module mymod;\nint add(int a, int b) { return a + b; }\n" + + +@pytest.fixture(scope="module") +def parsers_and_queries() -> tuple: + return load_parsers() + + +@pytest.fixture +def python_project(temp_repo: Path) -> Path: + project_path = temp_repo / "abs_path_test" + project_path.mkdir() + + pkg_dir = project_path / "mypkg" + pkg_dir.mkdir() + (pkg_dir / "__init__.py").write_text("") + + (pkg_dir / "mymodule.py").write_text( + "class MyClass:\n" + " def my_method(self):\n" + " pass\n" + "\n" + "def my_function():\n" + " pass\n" + ) + + misc_dir = project_path / "misc" + misc_dir.mkdir() + (misc_dir / "notes.txt").write_text("not a package") + + (project_path / "standalone.py").write_text("def standalone_func():\n pass\n") + + return project_path + + +class TestAbsolutePathOnNodes: + def test_file_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + file_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FILE) + assert len(file_nodes) > 0 + for node_call in file_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + assert abs_path == Path(abs_path).resolve().as_posix() + + def test_module_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + module_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE) + internal_modules = [c for c in module_nodes if not c[0][1].get("is_external")] + assert len(internal_modules) > 0 + for node_call in internal_modules: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + + def test_package_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + package_nodes = get_nodes(mock_ingestor, cs.NodeLabel.PACKAGE) + assert len(package_nodes) > 0 + for node_call in package_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + + def 
test_function_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + func_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FUNCTION) + assert len(func_nodes) > 0 + for node_call in func_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert cs.KEY_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + + def test_class_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + class_nodes = get_nodes(mock_ingestor, cs.NodeLabel.CLASS) + assert len(class_nodes) > 0 + for node_call in class_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert cs.KEY_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + + def test_method_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + method_nodes = get_nodes(mock_ingestor, cs.NodeLabel.METHOD) + assert len(method_nodes) > 0 + for node_call in method_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert cs.KEY_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + + def test_folder_nodes_have_absolute_path( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + folder_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FOLDER) + assert len(folder_nodes) > 0 + for node_call in folder_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + abs_path = props[cs.KEY_ABSOLUTE_PATH] + assert Path(abs_path).is_absolute() + + def test_absolute_path_matches_resolved_file( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + module_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE) + mymodule_nodes = [ + c for c in module_nodes if c[0][1].get(cs.KEY_NAME) == "mymodule.py" + ] + assert len(mymodule_nodes) == 1 + props = mymodule_nodes[0][0][1] + expected = (python_project / "mypkg" / "mymodule.py").resolve().as_posix() + assert props[cs.KEY_ABSOLUTE_PATH] == expected + + def test_absolute_path_is_posix_format( + self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + file_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FILE) + for node_call in file_nodes: + abs_path = node_call[0][1][cs.KEY_ABSOLUTE_PATH] + assert "\\" not in abs_path + + def test_project_node_has_no_absolute_path( + 
self, + python_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.PYTHON not in parsers_and_queries[0]: + pytest.skip("Python parser not available") + run_updater(python_project, mock_ingestor) + project_nodes = get_nodes(mock_ingestor, cs.NodeLabel.PROJECT) + assert len(project_nodes) > 0 + for node_call in project_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH not in props + + +@pytest.fixture +def ts_project(temp_repo: Path) -> Path: + project_path = temp_repo / "ts_abs_test" + project_path.mkdir() + (project_path / "types.ts").write_text(TS_CODE) + return project_path + + +class TestTypeScriptAbsolutePath: + def test_interface_nodes_have_absolute_path( + self, + ts_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.TS not in parsers_and_queries[0]: + pytest.skip("TypeScript parser not available") + run_updater(ts_project, mock_ingestor) + interface_nodes = get_nodes(mock_ingestor, cs.NodeLabel.INTERFACE) + assert len(interface_nodes) > 0 + for node_call in interface_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert Path(props[cs.KEY_ABSOLUTE_PATH]).is_absolute() + + def test_enum_nodes_have_absolute_path( + self, + ts_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.TS not in parsers_and_queries[0]: + pytest.skip("TypeScript parser not available") + run_updater(ts_project, mock_ingestor) + enum_nodes = get_nodes(mock_ingestor, cs.NodeLabel.ENUM) + assert len(enum_nodes) > 0 + for node_call in enum_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert Path(props[cs.KEY_ABSOLUTE_PATH]).is_absolute() + + +@pytest.fixture +def cpp_module_project(temp_repo: Path) -> Path: + project_path = temp_repo / "cpp_abs_test" + project_path.mkdir() + (project_path / "mymod.cppm").write_text(CPP_MODULE_INTERFACE) + (project_path / "mymod_impl.cpp").write_text(CPP_MODULE_IMPL) + return project_path + + +class TestCppModuleAbsolutePath: + def test_module_interface_nodes_have_absolute_path( + self, + cpp_module_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.CPP not in parsers_and_queries[0]: + pytest.skip("C++ parser not available") + run_updater(cpp_module_project, mock_ingestor) + mi_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE_INTERFACE) + if len(mi_nodes) == 0: + pytest.skip("No ModuleInterface nodes produced") + for node_call in mi_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert Path(props[cs.KEY_ABSOLUTE_PATH]).is_absolute() + + def test_module_implementation_nodes_have_absolute_path( + self, + cpp_module_project: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + if cs.SupportedLanguage.CPP not in parsers_and_queries[0]: + pytest.skip("C++ parser not available") + run_updater(cpp_module_project, mock_ingestor) + mi_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE_IMPLEMENTATION) + if len(mi_nodes) == 0: + pytest.skip("No ModuleImplementation nodes produced") + for node_call in mi_nodes: + props = node_call[0][1] + assert cs.KEY_ABSOLUTE_PATH in props + assert Path(props[cs.KEY_ABSOLUTE_PATH]).is_absolute() diff --git a/codebase_rag/tests/test_c_language.py b/codebase_rag/tests/test_c_language.py new file mode 100644 index 000000000..e8253c6be --- /dev/null +++ b/codebase_rag/tests/test_c_language.py @@ -0,0 
+1,371 @@
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+
+from codebase_rag import constants as cs
+from codebase_rag.tests.conftest import (
+    get_node_names,
+    get_nodes,
+    get_relationships,
+    run_updater,
+)
+
+
+@pytest.fixture
+def c_project(temp_repo: Path) -> Path:
+    project_path = temp_repo / "c_test_project"
+    project_path.mkdir()
+
+    (project_path / "Makefile").write_text("all:\n\tgcc -o main main.c\n")
+
+    (project_path / "main.c").write_text(
+        '#include "utils.h"\n'
+        "#include <stdio.h>\n"
+        "\n"
+        "void greet(void) {\n"
+        '    printf("Hello\\n");\n'
+        "}\n"
+        "\n"
+        "int add(int a, int b) {\n"
+        "    return a + b;\n"
+        "}\n"
+        "\n"
+        "int* get_ptr(void) {\n"
+        "    static int x = 42;\n"
+        "    return &x;\n"
+        "}\n"
+        "\n"
+        "int main(void) {\n"
+        "    greet();\n"
+        "    int result = add(1, 2);\n"
+        "    int* p = get_ptr();\n"
+        "    return 0;\n"
+        "}\n"
+    )
+
+    (project_path / "utils.h").write_text(
+        "#ifndef UTILS_H\n"
+        "#define UTILS_H\n"
+        "\n"
+        "int add(int a, int b);\n"
+        "void greet(void);\n"
+        "\n"
+        "#endif\n"
+    )
+
+    (project_path / "types.c").write_text(
+        "struct Point {\n"
+        "    int x;\n"
+        "    int y;\n"
+        "};\n"
+        "\n"
+        "union Value {\n"
+        "    int i;\n"
+        "    float f;\n"
+        "};\n"
+        "\n"
+        "enum Color {\n"
+        "    RED,\n"
+        "    GREEN,\n"
+        "    BLUE\n"
+        "};\n"
+    )
+
+    return project_path
+
+
+@pytest.fixture
+def c_subdir_project(temp_repo: Path) -> Path:
+    project_path = temp_repo / "c_subdir_project"
+    project_path.mkdir()
+
+    (project_path / "CMakeLists.txt").write_text(
+        "cmake_minimum_required(VERSION 3.10)\nproject(myapp)\n"
+    )
+
+    src_dir = project_path / "src"
+    src_dir.mkdir()
+    (src_dir / "Makefile").write_text("all:\n\tgcc -o app app.c\n")
+
+    (src_dir / "app.c").write_text(
+        "void run(void) {}\n\nint main(void) {\n    run();\n    return 0;\n}\n"
+    )
+
+    return project_path
+
+
+class TestCFunctionNodes:
+    def test_simple_function_detected(
+        self,
+        c_project: Path,
+        mock_ingestor: MagicMock,
+    ) -> None:
+        run_updater(c_project, mock_ingestor, skip_if_missing="c")
+        func_names = get_node_names(mock_ingestor, cs.NodeLabel.FUNCTION)
+        assert any("add" in name for name in func_names)
+
+    def test_void_function_detected(
+        self,
+        c_project: Path,
+        mock_ingestor: MagicMock,
+    ) -> None:
+        run_updater(c_project, mock_ingestor, skip_if_missing="c")
+        func_names = get_node_names(mock_ingestor, cs.NodeLabel.FUNCTION)
+        assert any("greet" in name for name in func_names)
+
+    def test_pointer_return_function_detected(
+        self,
+        c_project: Path,
+        mock_ingestor: MagicMock,
+    ) -> None:
+        run_updater(c_project, mock_ingestor, skip_if_missing="c")
+        func_names = get_node_names(mock_ingestor, cs.NodeLabel.FUNCTION)
+        assert any("get_ptr" in name for name in func_names)
+
+    def test_main_function_detected(
+        self,
+        c_project: Path,
+        mock_ingestor: MagicMock,
+    ) -> None:
+        run_updater(c_project, mock_ingestor, skip_if_missing="c")
+        func_names = get_node_names(mock_ingestor, cs.NodeLabel.FUNCTION)
+        assert any("main" in name for name in func_names)
+
+    def test_function_with_parameters(
+        self,
+        c_project: Path,
+        mock_ingestor: MagicMock,
+    ) -> None:
+        run_updater(c_project, mock_ingestor, skip_if_missing="c")
+        func_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FUNCTION)
+        add_nodes = [
+            n for n in func_nodes if "add" in n[0][1].get(cs.KEY_QUALIFIED_NAME, "")
+        ]
+        assert len(add_nodes) > 0
+
+
+class TestCStructNodes:
+    def test_struct_detected(
+        self,
+        c_project: Path,
+        mock_ingestor: MagicMock,
+    ) -> None:
+        run_updater(c_project, mock_ingestor, 
skip_if_missing="c") + class_names = get_node_names(mock_ingestor, cs.NodeLabel.CLASS) + assert any("Point" in name for name in class_names) + + def test_struct_has_qualified_name( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + class_nodes = get_nodes(mock_ingestor, cs.NodeLabel.CLASS) + point_nodes = [ + n for n in class_nodes if "Point" in n[0][1].get(cs.KEY_QUALIFIED_NAME, "") + ] + assert len(point_nodes) > 0 + qn = point_nodes[0][0][1][cs.KEY_QUALIFIED_NAME] + assert "." in qn + + +class TestCUnionNodes: + def test_union_detected( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + union_names = get_node_names(mock_ingestor, cs.NodeLabel.UNION) + class_names = get_node_names(mock_ingestor, cs.NodeLabel.CLASS) + all_names = union_names | class_names + assert any("Value" in name for name in all_names) + + +class TestCEnumNodes: + def test_enum_detected( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + enum_names = get_node_names(mock_ingestor, cs.NodeLabel.ENUM) + class_names = get_node_names(mock_ingestor, cs.NodeLabel.CLASS) + all_names = enum_names | class_names + assert any("Color" in name for name in all_names) + + +class TestCCallsRelationships: + def test_function_call_detected( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + calls = get_relationships(mock_ingestor, str(cs.RelationshipType.CALLS)) + assert len(calls) > 0 + + def test_main_calls_greet( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + calls = get_relationships(mock_ingestor, str(cs.RelationshipType.CALLS)) + call_pairs = [] + for c in calls: + src = c.args[0] if c.args else c[0][0] + tgt = c.args[2] if len(c.args) > 2 else c[0][2] + if isinstance(src, tuple) and isinstance(tgt, tuple): + call_pairs.append((src, tgt)) + found_greet = any( + "main" in str(src) and "greet" in str(tgt) for src, tgt in call_pairs + ) + assert found_greet + + def test_multiple_calls_from_main( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + calls = get_relationships(mock_ingestor, str(cs.RelationshipType.CALLS)) + main_calls = [ + c for c in calls if "main" in str(c.args[0] if c.args else c[0][0]) + ] + assert len(main_calls) >= 2 + + +class TestCDefinesRelationships: + def test_module_defines_functions( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + defines = get_relationships(mock_ingestor, str(cs.RelationshipType.DEFINES)) + assert len(defines) > 0 + + def test_main_module_defines_add( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + defines = get_relationships(mock_ingestor, str(cs.RelationshipType.DEFINES)) + found = any("add" in str(d) for d in defines) + assert found + + +class TestCImportsRelationships: + def test_include_creates_external_module( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + module_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE) + external_modules = [n for n in module_nodes if 
n[0][1].get(cs.KEY_IS_EXTERNAL)] + has_stdio = any("stdio" in str(n) for n in external_modules) + has_utils = any( + "utils" in n[0][1].get(cs.KEY_QUALIFIED_NAME, "") for n in module_nodes + ) + assert has_stdio or has_utils + + def test_include_utils_h_module_exists( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + module_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE) + module_qnames = {n[0][1].get(cs.KEY_QUALIFIED_NAME, "") for n in module_nodes} + assert any("utils" in qn for qn in module_qnames) + + +class TestCFileAndModuleNodes: + def test_c_file_nodes_created( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + file_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FILE) + file_paths = {n[0][1].get(cs.KEY_PATH, "") for n in file_nodes} + assert any("main.c" in p for p in file_paths) + assert any("types.c" in p for p in file_paths) + + def test_c_module_nodes_created( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + module_nodes = get_nodes(mock_ingestor, cs.NodeLabel.MODULE) + module_names = {n[0][1].get(cs.KEY_QUALIFIED_NAME, "") for n in module_nodes} + assert any("main" in name for name in module_names) + + def test_header_file_node_created( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + file_nodes = get_nodes(mock_ingestor, cs.NodeLabel.FILE) + file_paths = {n[0][1].get(cs.KEY_PATH, "") for n in file_nodes} + assert any("utils.h" in p for p in file_paths) + + +class TestCQualifiedNames: + def test_function_qualified_name_has_project( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + func_names = get_node_names(mock_ingestor, cs.NodeLabel.FUNCTION) + for name in func_names: + assert "." 
in name, f"Qualified name should contain '.': {name}" + + def test_function_qualified_name_format( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + func_names = get_node_names(mock_ingestor, cs.NodeLabel.FUNCTION) + add_names = [n for n in func_names if "add" in n] + assert len(add_names) > 0 + parts = add_names[0].split(".") + assert len(parts) >= 2 + + +class TestCPackageDetection: + def test_makefile_creates_package( + self, + c_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_project, mock_ingestor, skip_if_missing="c") + package_nodes = get_nodes(mock_ingestor, cs.NodeLabel.PACKAGE) + assert len(package_nodes) > 0 + + def test_cmakelists_creates_package( + self, + c_subdir_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_subdir_project, mock_ingestor, skip_if_missing="c") + package_nodes = get_nodes(mock_ingestor, cs.NodeLabel.PACKAGE) + assert len(package_nodes) > 0 + + def test_subdirectory_with_makefile_is_package( + self, + c_subdir_project: Path, + mock_ingestor: MagicMock, + ) -> None: + run_updater(c_subdir_project, mock_ingestor, skip_if_missing="c") + package_nodes = get_nodes(mock_ingestor, cs.NodeLabel.PACKAGE) + package_qnames = {n[0][1].get(cs.KEY_QUALIFIED_NAME, "") for n in package_nodes} + assert any("src" in qn for qn in package_qnames) diff --git a/codebase_rag/tests/test_call_processor.py b/codebase_rag/tests/test_call_processor.py index a6ae5cc34..e9dccf2c8 100644 --- a/codebase_rag/tests/test_call_processor.py +++ b/codebase_rag/tests/test_call_processor.py @@ -1153,8 +1153,10 @@ def test_logs_error_on_processing_failure( tree = parser.parse(b"def foo(): pass") root_node = tree.root_node + from codebase_rag.parsers.call_processor import CallProcessor + with patch.object( - call_processor, + CallProcessor, "_process_calls_in_functions", side_effect=RuntimeError("Simulated failure"), ): @@ -1166,9 +1168,9 @@ def test_logs_error_on_processing_failure( queries, ) mock_logger.error.assert_called_once() - error_call_args = mock_logger.error.call_args[0][0] - assert "test_module.py" in error_call_args - assert "Simulated failure" in error_call_args + error_call_args = mock_logger.error.call_args + assert "test_module.py" in str(error_call_args) + assert "Simulated failure" in str(error_call_args) def test_continues_after_error_in_single_file( self, @@ -1195,8 +1197,10 @@ def test_continues_after_error_in_single_file( tree = parser.parse(b"def foo(): pass") root_node = tree.root_node + from codebase_rag.parsers.call_processor import CallProcessor + with patch.object( - call_processor, + CallProcessor, "_process_calls_in_functions", side_effect=ValueError("Test exception"), ): @@ -1206,3 +1210,23 @@ def test_continues_after_error_in_single_file( cs.SupportedLanguage.PYTHON, queries, ) + + +class TestCallProcessorSlots: + def test_has_slots(self) -> None: + from codebase_rag.parsers.call_processor import CallProcessor + + assert hasattr(CallProcessor, "__slots__") + + def test_no_instance_dict(self, call_processor: CallProcessor) -> None: + assert not hasattr(call_processor, "__dict__") + + def test_rejects_arbitrary_attribute(self, call_processor: CallProcessor) -> None: + with pytest.raises(AttributeError): + call_processor.nonexistent_attr = 42 + + def test_slot_attributes_accessible(self, call_processor: CallProcessor) -> None: + assert hasattr(call_processor, "ingestor") + assert hasattr(call_processor, "repo_path") + assert 
hasattr(call_processor, "project_name") + assert hasattr(call_processor, "_resolver") diff --git a/codebase_rag/tests/test_call_processor_integration.py b/codebase_rag/tests/test_call_processor_integration.py index e388b96c4..b3b326ba7 100644 --- a/codebase_rag/tests/test_call_processor_integration.py +++ b/codebase_rag/tests/test_call_processor_integration.py @@ -793,7 +793,11 @@ def with_value(self, value): def build(self): return {} +def helper(): + pass + def main(): + helper() result = Builder().with_name("test").with_value(42).build() return result """, @@ -814,6 +818,10 @@ def main(): ] assert len(calls) >= 1 + # (H) Builder() is a class instantiation, not a function call + class_targets = [c for c in calls if c.args[2][0] == cs.NodeLabel.CLASS] + assert len(class_targets) == 0 + def test_handles_init_py_module_qn( self, temp_repo: Path, @@ -853,3 +861,90 @@ def package_func(): caller_qns = [c.args[0][2] for c in calls] package_callers = [qn for qn in caller_qns if "mypackage" in qn] assert len(package_callers) >= 1 + + +class TestModuleCallsClassFiltered: + def test_module_does_not_call_class_python( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + test_file = temp_repo / "test_module.py" + test_file.write_text( + encoding="utf-8", + data=""" +class MyClass: + def method(self): + pass + +def helper(): + pass + +helper() +""", + ) + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + updater.run() + + calls = [ + c + for c in mock_ingestor.ensure_relationship_batch.call_args_list + if c.args[1] == cs.RelationshipType.CALLS + ] + + class_targets = [c for c in calls if c.args[2][0] == cs.NodeLabel.CLASS] + assert class_targets == [] + + helper_calls = [c for c in calls if "helper" in c.args[2][2]] + assert len(helper_calls) >= 1 + + def test_function_does_not_call_class_python( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple, + ) -> None: + parsers, queries = parsers_and_queries + if cs.SupportedLanguage.PYTHON not in parsers: + pytest.skip("Python parser not available") + + test_file = temp_repo / "test_module.py" + test_file.write_text( + encoding="utf-8", + data=""" +class MyClass: + pass + +def factory(): + obj = MyClass() + return obj +""", + ) + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + updater.run() + + calls = [ + c + for c in mock_ingestor.ensure_relationship_batch.call_args_list + if c.args[1] == cs.RelationshipType.CALLS + ] + + class_targets = [c for c in calls if c.args[2][0] == cs.NodeLabel.CLASS] + assert class_targets == [] diff --git a/codebase_rag/tests/test_call_resolver.py b/codebase_rag/tests/test_call_resolver.py index da4108f95..f3b9688c9 100644 --- a/codebase_rag/tests/test_call_resolver.py +++ b/codebase_rag/tests/test_call_resolver.py @@ -1024,3 +1024,356 @@ def test_falls_back_to_trie(self, call_resolver: CallResolver) -> None: def test_returns_none_for_unknown(self, call_resolver: CallResolver) -> None: result = call_resolver.resolve_function_call("unknown_func", "proj.module") assert result is None + + +class TestDequeBfs: + def test_bfs_order_prefers_closer_parent(self, call_resolver: CallResolver) -> None: + call_resolver.function_registry["proj.base.ParentA.method"] = NodeType.METHOD + 
call_resolver.function_registry["proj.base.ParentB.method"] = NodeType.METHOD + call_resolver.class_inheritance["proj.module.Child"] = [ + "proj.base.ParentA", + "proj.base.ParentB", + ] + + result = call_resolver._resolve_inherited_method("proj.module.Child", "method") + assert result is not None + assert result[1] == "proj.base.ParentA.method" + + def test_bfs_finds_deep_ancestor_method(self, call_resolver: CallResolver) -> None: + call_resolver.function_registry["proj.base.Root.deep_method"] = NodeType.METHOD + call_resolver.class_inheritance["proj.module.Child"] = ["proj.mid.Middle"] + call_resolver.class_inheritance["proj.mid.Middle"] = ["proj.base.Root"] + + result = call_resolver._resolve_inherited_method( + "proj.module.Child", "deep_method" + ) + assert result is not None + assert result[1] == "proj.base.Root.deep_method" + + def test_bfs_no_infinite_loop_on_cycle(self, call_resolver: CallResolver) -> None: + call_resolver.class_inheritance["proj.A"] = ["proj.B"] + call_resolver.class_inheritance["proj.B"] = ["proj.A"] + + result = call_resolver._resolve_inherited_method("proj.A", "missing") + assert result is None + + +class TestSeparatorPattern: + def test_splits_on_dot(self) -> None: + from codebase_rag.parsers.call_resolver import _SEPARATOR_PATTERN + + assert _SEPARATOR_PATTERN.split("a.b.c") == ["a", "b", "c"] + + def test_splits_on_colon(self) -> None: + from codebase_rag.parsers.call_resolver import _SEPARATOR_PATTERN + + assert _SEPARATOR_PATTERN.split("module:func") == ["module", "func"] + + def test_splits_on_double_colon(self) -> None: + from codebase_rag.parsers.call_resolver import _SEPARATOR_PATTERN + + assert _SEPARATOR_PATTERN.split("crate::module::func") == [ + "crate", + "", + "module", + "", + "func", + ] + + def test_no_separator_returns_single_element(self) -> None: + from codebase_rag.parsers.call_resolver import _SEPARATOR_PATTERN + + assert _SEPARATOR_PATTERN.split("simple") == ["simple"] + + def test_last_element_matches_function_name(self) -> None: + from codebase_rag.parsers.call_resolver import _SEPARATOR_PATTERN + + assert _SEPARATOR_PATTERN.split("a.b.func")[-1] == "func" + assert _SEPARATOR_PATTERN.split("module:method")[-1] == "method" + + +class TestChainedMethodPattern: + def test_matches_final_method(self) -> None: + from codebase_rag.parsers.call_resolver import _CHAINED_METHOD_PATTERN + + match = _CHAINED_METHOD_PATTERN.search("obj.method().next") + assert match is not None + assert match[1] == "next" + + def test_no_match_on_parenthesized_suffix(self) -> None: + from codebase_rag.parsers.call_resolver import _CHAINED_METHOD_PATTERN + + match = _CHAINED_METHOD_PATTERN.search("obj.method()") + assert match is None + + def test_matches_deeply_chained(self) -> None: + from codebase_rag.parsers.call_resolver import _CHAINED_METHOD_PATTERN + + match = _CHAINED_METHOD_PATTERN.search("a.b().c().final_method") + assert match is not None + assert match[1] == "final_method" + + +class TestDeterministicResolution: + def test_trie_tiebreak_by_qualified_name(self, call_resolver: CallResolver) -> None: + # (H) Register multiple functions with the same simple name in different modules + # at equal import distance from the caller + call_resolver.function_registry["proj.alpha.utils.helper"] = NodeType.FUNCTION + call_resolver.function_registry["proj.beta.utils.helper"] = NodeType.FUNCTION + call_resolver.function_registry["proj.gamma.utils.helper"] = NodeType.FUNCTION + + results = [] + for _ in range(20): + result = 
call_resolver._try_resolve_via_trie("helper", "proj.delta.module") + assert result is not None + results.append(result[1]) + + # (H) All 20 runs must resolve to the same candidate (lexicographically first) + assert all(r == results[0] for r in results) + assert results[0] == "proj.alpha.utils.helper" + + def test_trie_tiebreak_picks_lexicographic_first( + self, call_resolver: CallResolver + ) -> None: + # (H) Deliberately insert in reverse lexicographic order + call_resolver.function_registry["proj.zoo.compute"] = NodeType.FUNCTION + call_resolver.function_registry["proj.mid.compute"] = NodeType.FUNCTION + call_resolver.function_registry["proj.aaa.compute"] = NodeType.FUNCTION + + result = call_resolver._try_resolve_via_trie("compute", "other.module") + assert result is not None + assert result[1] == "proj.aaa.compute" + + def test_trie_tiebreak_distance_still_wins( + self, call_resolver: CallResolver + ) -> None: + # (H) Closer module should win even if lexicographically later + call_resolver.function_registry["proj.far.away.process"] = NodeType.FUNCTION + call_resolver.function_registry["proj.module.process"] = NodeType.FUNCTION + + result = call_resolver._try_resolve_via_trie("process", "proj.module.caller") + assert result is not None + # (H) proj.module.process is closer to proj.module.caller + assert result[1] == "proj.module.process" + + def test_trie_many_candidates_deterministic( + self, call_resolver: CallResolver + ) -> None: + # (H) Register 10 equidistant candidates + names = [ + "proj.m09.run", + "proj.m05.run", + "proj.m01.run", + "proj.m07.run", + "proj.m03.run", + "proj.m08.run", + "proj.m02.run", + "proj.m06.run", + "proj.m04.run", + "proj.m10.run", + ] + for name in names: + call_resolver.function_registry[name] = NodeType.FUNCTION + + result = call_resolver._try_resolve_via_trie("run", "other.caller") + assert result is not None + assert result[1] == "proj.m01.run" + + def test_resolve_function_call_deterministic_across_runs( + self, call_resolver: CallResolver + ) -> None: + call_resolver.function_registry["pkg.svc_a.validate"] = NodeType.FUNCTION + call_resolver.function_registry["pkg.svc_b.validate"] = NodeType.FUNCTION + call_resolver.function_registry["pkg.svc_c.validate"] = NodeType.FUNCTION + + results = set() + for _ in range(10): + result = call_resolver.resolve_function_call( + "validate", "pkg.other.module", {}, None + ) + assert result is not None + results.add(result[1]) + + # (H) Must resolve to exactly one candidate across all runs + assert len(results) == 1 + + +class TestDeterministicFileOrder: + def test_eligible_files_are_sorted( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + + # (H) Create files in non-alphabetical order + for name in ["zebra.py", "alpha.py", "middle.py", "beta.py"]: + (temp_repo / name).write_text(f"def func_{name[0]}(): pass\n") + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + + eligible = updater._collect_eligible_files() + paths_str = [str(f) for f in eligible] + + assert paths_str == sorted(paths_str) + + def test_graph_output_deterministic_across_runs(self, temp_repo: Path) -> None: + parsers, queries = load_parsers() + + (temp_repo / "mod_a.py").write_text( + "def shared(): pass\ndef call_a(): shared()\n" + ) + (temp_repo / "mod_b.py").write_text( + "def shared(): pass\ndef call_b(): shared()\n" + ) + + results = [] + for _ in range(5): + ingestor = MagicMock() + updater = GraphUpdater( + 
ingestor=ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + updater.run(force=True) + + calls = [ + (c.args[0][2], c.args[1], c.args[2][2]) + for c in ingestor.ensure_relationship_batch.call_args_list + if c.args[1] == cs.RelationshipType.CALLS + ] + calls.sort() + results.append(calls) + + # (H) All 5 runs must produce identical call graphs + assert len(results[0]) > 0 + for i in range(1, len(results)): + assert results[i] == results[0] + + def _run_determinism_check(self, temp_repo: Path, runs: int = 5) -> None: + parsers, queries = load_parsers() + results = [] + for _ in range(runs): + ingestor = MagicMock() + updater = GraphUpdater( + ingestor=ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + updater.run(force=True) + + calls = [ + (c.args[0][2], c.args[2][2]) + for c in ingestor.ensure_relationship_batch.call_args_list + if c.args[1] == cs.RelationshipType.CALLS + ] + calls.sort() + results.append(calls) + + assert len(results[0]) > 0 + for i in range(1, len(results)): + assert results[i] == results[0] + + def test_javascript_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.JS not in parsers: + pytest.skip("JavaScript parser not available") + + (temp_repo / "utils.js").write_text( + "function helper() {}\nfunction worker() { helper(); }\n" + ) + (temp_repo / "main.js").write_text( + "function helper() {}\nfunction entry() { helper(); }\n" + ) + self._run_determinism_check(temp_repo) + + def test_typescript_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.TS not in parsers: + pytest.skip("TypeScript parser not available") + + (temp_repo / "service.ts").write_text( + "function validate(x: string): boolean { return true; }\n" + "function process() { validate('test'); }\n" + ) + (temp_repo / "handler.ts").write_text( + "function validate(x: string): boolean { return false; }\n" + "function handle() { validate('input'); }\n" + ) + self._run_determinism_check(temp_repo) + + def test_rust_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.RUST not in parsers: + pytest.skip("Rust parser not available") + + (temp_repo / "utils.rs").write_text( + "fn compute() -> i32 { 42 }\nfn run() { compute(); }\n" + ) + (temp_repo / "main.rs").write_text( + "fn compute() -> i32 { 0 }\nfn start() { compute(); }\n" + ) + self._run_determinism_check(temp_repo) + + def test_java_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.JAVA not in parsers: + pytest.skip("Java parser not available") + + (temp_repo / "Utils.java").write_text( + "public class Utils {\n" + " public static void process() {}\n" + " public static void run() { process(); }\n" + "}\n" + ) + (temp_repo / "Helper.java").write_text( + "public class Helper {\n" + " public static void process() {}\n" + " public static void execute() { process(); }\n" + "}\n" + ) + self._run_determinism_check(temp_repo) + + def test_cpp_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.CPP not in parsers: + pytest.skip("C++ parser not available") + + (temp_repo / "math.cpp").write_text( + "int calculate() { return 1; }\nint run() { return calculate(); }\n" + ) + (temp_repo / "logic.cpp").write_text( + "int calculate() { return 2; }\nint start() { return calculate(); }\n" + ) + self._run_determinism_check(temp_repo) + + def test_go_deterministic(self, 
temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.GO not in parsers: + pytest.skip("Go parser not available") + + (temp_repo / "util.go").write_text( + "package main\nfunc helper() {}\nfunc doWork() { helper() }\n" + ) + (temp_repo / "main.go").write_text( + "package main\nfunc helper() {}\nfunc run() { helper() }\n" + ) + self._run_determinism_check(temp_repo) + + def test_lua_deterministic(self, temp_repo: Path) -> None: + parsers, _ = load_parsers() + if cs.SupportedLanguage.LUA not in parsers: + pytest.skip("Lua parser not available") + + (temp_repo / "utils.lua").write_text( + "local function process() end\nlocal function run() process() end\n" + ) + (temp_repo / "main.lua").write_text( + "local function process() end\nlocal function start() process() end\n" + ) + self._run_determinism_check(temp_repo) diff --git a/codebase_rag/tests/test_cgr_shim.py b/codebase_rag/tests/test_cgr_shim.py new file mode 100644 index 000000000..b7cdbd8fc --- /dev/null +++ b/codebase_rag/tests/test_cgr_shim.py @@ -0,0 +1,41 @@ +import cgr + + +class TestCgrShimExports: + def test_all_symbols_importable(self) -> None: + for name in cgr.__all__: + assert hasattr(cgr, name), f"{name!r} listed in __all__ but not importable" + + def test_all_matches_module_exports(self) -> None: + public_attrs = {k for k in vars(cgr) if not k.startswith("_")} + assert set(cgr.__all__) == public_attrs + + def test_settings_is_canonical_instance(self) -> None: + from codebase_rag.config import settings + + assert cgr.settings is settings + + def test_embed_code_is_canonical_function(self) -> None: + from codebase_rag.embedder import embed_code + + assert cgr.embed_code is embed_code + + def test_graph_loader_is_canonical_class(self) -> None: + from codebase_rag.graph_loader import GraphLoader + + assert cgr.GraphLoader is GraphLoader + + def test_load_graph_is_canonical_function(self) -> None: + from codebase_rag.graph_loader import load_graph + + assert cgr.load_graph is load_graph + + def test_memgraph_ingestor_is_canonical_class(self) -> None: + from codebase_rag.services.graph_service import MemgraphIngestor + + assert cgr.MemgraphIngestor is MemgraphIngestor + + def test_cypher_generator_is_canonical_class(self) -> None: + from codebase_rag.services.llm import CypherGenerator + + assert cgr.CypherGenerator is CypherGenerator diff --git a/codebase_rag/tests/test_cgrignore.py b/codebase_rag/tests/test_cgrignore.py index 09cb814be..0740c228d 100644 --- a/codebase_rag/tests/test_cgrignore.py +++ b/codebase_rag/tests/test_cgrignore.py @@ -1,10 +1,13 @@ from __future__ import annotations +from collections.abc import Generator from pathlib import Path from unittest.mock import MagicMock, patch import pytest +from typer.testing import CliRunner +from codebase_rag.cli import app from codebase_rag.config import ( CGRIGNORE_FILENAME, EMPTY_CGRIGNORE, @@ -265,3 +268,137 @@ def test_unignore_included_when_user_selects_all( assert "vendor" in result assert ".git" in result assert "custom" in result + + +@pytest.fixture +def mock_memgraph_connect() -> Generator[MagicMock, None, None]: + with patch("codebase_rag.cli.connect_memgraph") as mock_connect: + mock_ingestor = MagicMock() + mock_connect.return_value.__enter__ = MagicMock(return_value=mock_ingestor) + mock_connect.return_value.__exit__ = MagicMock(return_value=False) + yield mock_connect + + +class TestCgrignoreLoadedWithoutInteractiveSetup: + runner = CliRunner() + + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", 
return_value=({}, {})) + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_start_loads_cgrignore_without_interactive_setup( + self, + mock_load_cgrignore: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + cgrignore_patterns = CgrignorePatterns( + exclude=frozenset({"vendor", "build"}), + unignore=frozenset({"vendor/important"}), + ) + mock_load_cgrignore.return_value = cgrignore_patterns + + result = self.runner.invoke( + app, + ["start", "--update-graph", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + mock_load_cgrignore.assert_called_once_with(tmp_path) + updater_kwargs = mock_graph_updater.call_args.kwargs + assert updater_kwargs["unignore_paths"] == frozenset({"vendor/important"}) + assert "vendor" in updater_kwargs["exclude_paths"] + assert "build" in updater_kwargs["exclude_paths"] + + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + @patch("codebase_rag.cli.ProtobufFileIngestor") + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_index_loads_cgrignore_without_interactive_setup( + self, + mock_load_cgrignore: MagicMock, + mock_proto_ingestor: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + tmp_path: Path, + ) -> None: + cgrignore_patterns = CgrignorePatterns( + exclude=frozenset({"dist"}), + unignore=frozenset({"dist/assets"}), + ) + mock_load_cgrignore.return_value = cgrignore_patterns + + output_dir = str(tmp_path / "output") + + result = self.runner.invoke( + app, + ["index", "--repo-path", str(tmp_path), "-o", output_dir], + ) + + assert result.exit_code == 0, result.output + mock_load_cgrignore.assert_called_once_with(tmp_path) + updater_kwargs = mock_graph_updater.call_args.kwargs + assert updater_kwargs["unignore_paths"] == frozenset({"dist/assets"}) + assert "dist" in updater_kwargs["exclude_paths"] + + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_start_merges_cli_excludes_with_cgrignore( + self, + mock_load_cgrignore: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + cgrignore_patterns = CgrignorePatterns( + exclude=frozenset({"from_cgrignore"}), + unignore=frozenset(), + ) + mock_load_cgrignore.return_value = cgrignore_patterns + + result = self.runner.invoke( + app, + [ + "start", + "--update-graph", + "--repo-path", + str(tmp_path), + "--exclude", + "from_cli", + ], + ) + + assert result.exit_code == 0, result.output + updater_kwargs = mock_graph_updater.call_args.kwargs + assert "from_cgrignore" in updater_kwargs["exclude_paths"] + assert "from_cli" in updater_kwargs["exclude_paths"] + + @patch("codebase_rag.cli.prompt_for_unignored_directories") + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_start_does_not_prompt_without_interactive_setup( + self, + mock_load_cgrignore: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + mock_prompt: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + mock_load_cgrignore.return_value = CgrignorePatterns( + exclude=frozenset({"vendor"}), + unignore=frozenset({"vendor/keep"}), + ) + + result = 
self.runner.invoke( + app, + ["start", "--update-graph", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + mock_prompt.assert_not_called() + mock_load_cgrignore.assert_called_once() diff --git a/codebase_rag/tests/test_cli_clean.py b/codebase_rag/tests/test_cli_clean.py new file mode 100644 index 000000000..eb58c8458 --- /dev/null +++ b/codebase_rag/tests/test_cli_clean.py @@ -0,0 +1,196 @@ +from __future__ import annotations + +import json +from collections.abc import Generator +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from typer.testing import CliRunner + +from codebase_rag import constants as cs +from codebase_rag.cli import app +from codebase_rag.config import CgrignorePatterns + +runner = CliRunner() + + +@pytest.fixture +def mock_memgraph_connect() -> Generator[MagicMock, None, None]: + with patch("codebase_rag.cli.connect_memgraph") as mock_connect: + mock_ingestor = MagicMock() + mock_connect.return_value.__enter__ = MagicMock(return_value=mock_ingestor) + mock_connect.return_value.__exit__ = MagicMock(return_value=False) + yield mock_connect + + +def _get_ingestor(mock_connect: MagicMock) -> MagicMock: + return mock_connect.return_value.__enter__.return_value + + +class TestCleanWithoutUpdateGraph: + def test_clean_alone_wipes_database( + self, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + result = runner.invoke( + app, + ["start", "--clean", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + ingestor = _get_ingestor(mock_memgraph_connect) + ingestor.clean_database.assert_called_once() + + def test_clean_alone_deletes_hash_cache( + self, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + cache_path = tmp_path / cs.HASH_CACHE_FILENAME + cache_path.write_text(json.dumps({"file.py": "abc123"})) + + result = runner.invoke( + app, + ["start", "--clean", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + assert not cache_path.exists() + + def test_clean_alone_no_cache_file_still_succeeds( + self, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + cache_path = tmp_path / cs.HASH_CACHE_FILENAME + assert not cache_path.exists() + + result = runner.invoke( + app, + ["start", "--clean", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + + def test_clean_alone_does_not_invoke_graph_updater( + self, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + with patch("codebase_rag.cli.GraphUpdater") as mock_updater: + result = runner.invoke( + app, + ["start", "--clean", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + mock_updater.assert_not_called() + + def test_clean_alone_skips_model_validation( + self, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + with patch("codebase_rag.cli._update_and_validate_models") as mock_validate: + result = runner.invoke( + app, + ["start", "--clean", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + mock_validate.assert_not_called() + + def test_clean_alone_shows_clean_done_message( + self, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + result = runner.invoke( + app, + ["start", "--clean", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0 + assert cs.CLI_MSG_CLEAN_DONE in result.output + + +class TestCleanWithUpdateGraph: + @patch("codebase_rag.cli.GraphUpdater") + 
@patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_clean_with_update_deletes_hash_cache( + self, + mock_cgrignore: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + mock_cgrignore.return_value = CgrignorePatterns( + exclude=frozenset(), unignore=frozenset() + ) + + cache_path = tmp_path / cs.HASH_CACHE_FILENAME + cache_path.write_text(json.dumps({"file.py": "abc123"})) + + result = runner.invoke( + app, + ["start", "--clean", "--update-graph", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + assert not cache_path.exists() + + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_clean_with_update_calls_clean_database( + self, + mock_cgrignore: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + mock_cgrignore.return_value = CgrignorePatterns( + exclude=frozenset(), unignore=frozenset() + ) + + result = runner.invoke( + app, + ["start", "--clean", "--update-graph", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + ingestor = _get_ingestor(mock_memgraph_connect) + ingestor.clean_database.assert_called_once() + + @patch("codebase_rag.cli.GraphUpdater") + @patch("codebase_rag.cli.load_parsers", return_value=({}, {})) + @patch("codebase_rag.cli.load_cgrignore_patterns") + def test_update_without_clean_preserves_hash_cache( + self, + mock_cgrignore: MagicMock, + mock_load_parsers: MagicMock, + mock_graph_updater: MagicMock, + mock_memgraph_connect: MagicMock, + tmp_path: Path, + ) -> None: + mock_cgrignore.return_value = CgrignorePatterns( + exclude=frozenset(), unignore=frozenset() + ) + + cache_path = tmp_path / cs.HASH_CACHE_FILENAME + cache_data = {"file.py": "abc123"} + cache_path.write_text(json.dumps(cache_data)) + + result = runner.invoke( + app, + ["start", "--update-graph", "--repo-path", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + assert cache_path.exists() + assert json.loads(cache_path.read_text()) == cache_data diff --git a/codebase_rag/tests/test_cli_smoke.py b/codebase_rag/tests/test_cli_smoke.py index 88b420e07..06a254bda 100644 --- a/codebase_rag/tests/test_cli_smoke.py +++ b/codebase_rag/tests/test_cli_smoke.py @@ -1,9 +1,15 @@ +import re import subprocess import sys +from importlib.metadata import version as get_version from pathlib import Path import pytest +from codebase_rag import constants as cs + +_ANSI_RE = re.compile(r"\x1b\[[0-9;]*m") + def test_help_command_works() -> None: repo_root = Path(__file__).parent.parent.parent @@ -15,14 +21,14 @@ def test_help_command_works() -> None: capture_output=True, text=True, timeout=30, + env={**__import__("os").environ, "NO_COLOR": "1"}, ) assert result.returncode == 0, f"Help command failed with: {result.stderr}" - assert "Usage:" in result.stdout or "usage:" in result.stdout.lower() - assert "--help" in result.stdout - - assert result.stderr == "", f"Unexpected stderr: {result.stderr}" + plain_stdout = _ANSI_RE.sub("", result.stdout) + assert "Usage:" in plain_stdout or "usage:" in plain_stdout.lower() + assert "--help" in plain_stdout def test_import_cli_module() -> None: @@ -32,3 +38,28 @@ def test_import_cli_module() -> None: assert hasattr(cli, "app"), "CLI 
module missing app attribute" except ImportError as e: pytest.fail(f"Failed to import cli module: {e}") + + +def test_version_flag() -> None: + repo_root = Path(__file__).parent.parent.parent + + for flag in ["--version", "-v"]: + result = subprocess.run( + [sys.executable, "-m", "codebase_rag.cli", flag], + check=False, + cwd=repo_root, + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0, ( + f"{flag} exited with code {result.returncode}: {result.stderr}" + ) + expected = cs.CLI_MSG_VERSION.format( + package=cs.PACKAGE_NAME, version=get_version(cs.PACKAGE_NAME) + ) + assert result.stdout.strip() == expected, ( + f"{flag} output did not match expected format: {repr(result.stdout)}" + ) + assert result.stderr == "", f"Unexpected stderr for {flag}: {result.stderr}" diff --git a/codebase_rag/tests/test_codebase_query.py b/codebase_rag/tests/test_codebase_query.py index 3be753570..47d56fff9 100644 --- a/codebase_rag/tests/test_codebase_query.py +++ b/codebase_rag/tests/test_codebase_query.py @@ -69,6 +69,22 @@ def test_uses_provided_console( tool = create_query_tool(mock_ingestor, mock_cypher_gen, console=mock_console) assert tool is not None + async def test_default_console_writes_to_stderr( + self, + mock_ingestor: MagicMock, + mock_cypher_gen: MagicMock, + capsys: pytest.CaptureFixture[str], + ) -> None: + mock_cypher_gen.generate = AsyncMock(return_value="MATCH (n) RETURN n") + mock_ingestor.fetch_all.return_value = [{"name": "example"}] + + tool = create_query_tool(mock_ingestor, mock_cypher_gen, console=None) + await tool.function(natural_language_query="Find all functions") + + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err != "" + class TestQueryCodebaseKnowledgeGraph: async def test_successful_query_returns_results( diff --git a/codebase_rag/tests/test_config_validation.py b/codebase_rag/tests/test_config_validation.py new file mode 100644 index 000000000..c17c51a26 --- /dev/null +++ b/codebase_rag/tests/test_config_validation.py @@ -0,0 +1,85 @@ +import pytest + +from codebase_rag import constants as cs +from codebase_rag.config import ModelConfig, format_missing_api_key_errors + + +class TestValidateApiKey: + def test_local_providers_skip_validation(self) -> None: + cfg = ModelConfig(provider=cs.Provider.OLLAMA, model_id="llama3") + cfg.validate_api_key() + + def test_google_vertex_skips_validation(self) -> None: + cfg = ModelConfig( + provider=cs.Provider.GOOGLE, + model_id="gemini-pro", + provider_type=cs.GoogleProviderType.VERTEX, + ) + cfg.validate_api_key() + + def test_google_gla_requires_api_key(self) -> None: + cfg = ModelConfig( + provider=cs.Provider.GOOGLE, + model_id="gemini-pro", + provider_type=cs.GoogleProviderType.GLA, + ) + with pytest.raises(ValueError, match="API Key Missing"): + cfg.validate_api_key() + + @pytest.mark.parametrize( + "api_key_kwargs", + [ + {}, + {"api_key": ""}, + {"api_key": " "}, + {"api_key": cs.DEFAULT_API_KEY}, + ], + ) + def test_invalid_api_key_raises(self, api_key_kwargs: dict[str, str]) -> None: + cfg = ModelConfig( + provider=cs.Provider.OPENAI, model_id="gpt-4", **api_key_kwargs + ) + with pytest.raises(ValueError, match="API Key Missing"): + cfg.validate_api_key() + + def test_valid_api_key_passes(self) -> None: + cfg = ModelConfig( + provider=cs.Provider.OPENAI, model_id="gpt-4", api_key="sk-real-key-123" + ) + cfg.validate_api_key() + + def test_role_forwarded_to_error_message(self) -> None: + cfg = ModelConfig(provider=cs.Provider.OPENAI, model_id="gpt-4") + with 
pytest.raises(ValueError, match="cypher"): + cfg.validate_api_key(role="cypher") + + +class TestFormatMissingApiKeyErrors: + def test_known_provider_openai(self) -> None: + msg = format_missing_api_key_errors(cs.Provider.OPENAI) + assert "OPENAI_API_KEY" in msg + assert "https://platform.openai.com/api-keys" in msg + assert "OpenAI" in msg + + def test_known_provider_anthropic(self) -> None: + msg = format_missing_api_key_errors(cs.Provider.ANTHROPIC) + assert "ANTHROPIC_API_KEY" in msg + assert "Anthropic" in msg + + def test_unknown_provider_generic_message(self) -> None: + msg = format_missing_api_key_errors("deepseek") + assert "DEEPSEEK_API_KEY" in msg + assert "Deepseek" in msg + + def test_role_appears_in_message(self) -> None: + msg = format_missing_api_key_errors(cs.Provider.OPENAI, role="cypher") + assert "for cypher" in msg + + def test_default_role_omits_role_from_message(self) -> None: + msg = format_missing_api_key_errors(cs.Provider.OPENAI) + assert "for model" not in msg + + def test_case_insensitive_lookup(self) -> None: + msg = format_missing_api_key_errors("OpenAI") + assert "OPENAI_API_KEY" in msg + assert "OpenAI" in msg diff --git a/codebase_rag/tests/test_cpp_cross_file_methods.py b/codebase_rag/tests/test_cpp_cross_file_methods.py new file mode 100644 index 000000000..dbc2662de --- /dev/null +++ b/codebase_rag/tests/test_cpp_cross_file_methods.py @@ -0,0 +1,462 @@ +"""Tests for C++ cross-file out-of-class method resolution (issue #496). + +When a class is declared in a header (.h) and methods are implemented +out-of-class in a source file (.cpp) using ``ClassName::method`` syntax, +the Method nodes must link back to the correct Class node via +DEFINES_METHOD edges -- not to a phantom class constructed from the +.cpp module's qualified name. 
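+
+For example, with an illustrative pair of files (``widget.h`` and
+``widget.cpp`` are hypothetical names, not fixtures from this suite)::
+
+    // widget.h
+    class Widget {
+    public:
+        void draw();
+    };
+
+    // widget.cpp
+    #include "widget.h"
+    void Widget::draw() { /* ... */ }
+
+the Method node for ``draw`` must attach to the ``Widget`` Class node
+declared in ``widget.h``, not to a class derived from ``widget.cpp``.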
+""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag.constants import SEPARATOR_DOT +from codebase_rag.tests.conftest import ( + get_nodes, + get_relationships, + run_updater, +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _get_method_qns(mock_ingestor: MagicMock) -> set[str]: + """Return all Method qualified names recorded in the ingestor.""" + return {call[0][1]["qualified_name"] for call in get_nodes(mock_ingestor, "Method")} + + +def _get_class_qns(mock_ingestor: MagicMock) -> set[str]: + """Return all Class qualified names recorded in the ingestor.""" + return {call[0][1]["qualified_name"] for call in get_nodes(mock_ingestor, "Class")} + + +def _get_defines_method_edges( + mock_ingestor: MagicMock, +) -> list[tuple[str, str]]: + """Return ``(class_qn, method_qn)`` pairs from DEFINES_METHOD rels.""" + edges: list[tuple[str, str]] = [] + for rel in get_relationships(mock_ingestor, "DEFINES_METHOD"): + class_qn = rel.args[0][2] + method_qn = rel.args[2][2] + edges.append((class_qn, method_qn)) + return edges + + +def _method_names_for_class(mock_ingestor: MagicMock, class_name: str) -> set[str]: + """Method simple-names linked via DEFINES_METHOD to *class_name*.""" + names: set[str] = set() + for class_qn, method_qn in _get_defines_method_edges(mock_ingestor): + parts = class_qn.split(SEPARATOR_DOT) + if class_name in parts: + names.add(method_qn.split(SEPARATOR_DOT)[-1]) + return names + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def cpp_cross_file_project(temp_repo: Path) -> Path: + project = temp_repo / "cpp_cross_file" + project.mkdir() + return project + + +# --------------------------------------------------------------------------- +# Test: basic header + source cross-file methods +# --------------------------------------------------------------------------- + + +def test_header_source_method_resolution( + cpp_cross_file_project: Path, + mock_ingestor: MagicMock, +) -> None: + """Class in .h, implementations in .cpp -- methods must link to .h class.""" + include = cpp_cross_file_project / "include" + include.mkdir() + src = cpp_cross_file_project / "src" + src.mkdir() + + (include / "Calculator.h").write_text( + encoding="utf-8", + data="""\ +#pragma once + +class Calculator { +public: + int add(int a, int b); + int subtract(int a, int b); + double divide(int a, int b); +}; +""", + ) + + (src / "Calculator.cpp").write_text( + encoding="utf-8", + data="""\ +#include "Calculator.h" + +int Calculator::add(int a, int b) { + return a + b; +} + +int Calculator::subtract(int a, int b) { + return a - b; +} + +double Calculator::divide(int a, int b) { + if (b == 0) return 0; + return static_cast(a) / b; +} +""", + ) + + run_updater(cpp_cross_file_project, mock_ingestor) + + # The class should exist in the header module. + class_qns = _get_class_qns(mock_ingestor) + header_class = [qn for qn in class_qns if "include" in qn and "Calculator" in qn] + assert header_class, ( + f"Expected a Calculator class in include/, got classes: {class_qns}" + ) + + # All three out-of-class methods should have DEFINES_METHOD edges + # pointing to the *header* class, not to a phantom class in src/. 
+    edges = _get_defines_method_edges(mock_ingestor)
+    header_class_qn = header_class[0]
+    methods_linked_to_header = {
+        mq.split(SEPARATOR_DOT)[-1] for cq, mq in edges if cq == header_class_qn
+    }
+
+    assert "add" in methods_linked_to_header, (
+        f"'add' not linked to header class. Edges: {edges}"
+    )
+    assert "subtract" in methods_linked_to_header, (
+        f"'subtract' not linked to header class. Edges: {edges}"
+    )
+    assert "divide" in methods_linked_to_header, (
+        f"'divide' not linked to header class. Edges: {edges}"
+    )
+
+    # There should be NO orphan Method nodes (methods whose container_qn
+    # uses the .cpp module instead of the .h module).
+    method_qns = _get_method_qns(mock_ingestor)
+    orphan_methods = {
+        qn
+        for qn in method_qns
+        if "src.Calculator" in qn and "Calculator.Calculator" in qn
+    }
+    assert not orphan_methods, (
+        f"Found orphan methods with .cpp module QN: {orphan_methods}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Test: multiple source files implementing one header class
+# ---------------------------------------------------------------------------
+
+
+def test_multiple_source_files_one_class(
+    cpp_cross_file_project: Path,
+    mock_ingestor: MagicMock,
+) -> None:
+    """Two .cpp files implement methods of one class declared in .h."""
+    include = cpp_cross_file_project / "include"
+    include.mkdir()
+    src = cpp_cross_file_project / "src"
+    src.mkdir()
+
+    (include / "Engine.h").write_text(
+        encoding="utf-8",
+        data="""\
+#pragma once
+
+class Engine {
+public:
+    void start();
+    void stop();
+    void accelerate(int speed);
+    void brake();
+};
+""",
+    )
+
+    (src / "engine_control.cpp").write_text(
+        encoding="utf-8",
+        data="""\
+#include "Engine.h"
+
+void Engine::start() { /* ... */ }
+void Engine::stop() { /* ... */ }
+""",
+    )
+
+    (src / "engine_movement.cpp").write_text(
+        encoding="utf-8",
+        data="""\
+#include "Engine.h"
+
+void Engine::accelerate(int speed) { /* ... */ }
+void Engine::brake() { /* ... */ }
+""",
+    )
+
+    run_updater(cpp_cross_file_project, mock_ingestor)
+
+    class_qns = _get_class_qns(mock_ingestor)
+    header_classes = [qn for qn in class_qns if "include" in qn and "Engine" in qn]
+    assert header_classes, f"Expected Engine class in include/, got: {class_qns}"
+    header_class_qn = header_classes[0]
+
+    edges = _get_defines_method_edges(mock_ingestor)
+    methods_linked = {
+        mq.split(SEPARATOR_DOT)[-1] for cq, mq in edges if cq == header_class_qn
+    }
+
+    for method_name in ("start", "stop", "accelerate", "brake"):
+        assert method_name in methods_linked, (
+            f"'{method_name}' not linked to header Engine class. "
+            f"Linked methods: {methods_linked}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Test: constructor and destructor out-of-class across files
+# ---------------------------------------------------------------------------
+
+
+def test_cross_file_constructor_destructor(
+    cpp_cross_file_project: Path,
+    mock_ingestor: MagicMock,
+) -> None:
+    """Constructors and destructors implemented in .cpp link to .h class."""
+    include = cpp_cross_file_project / "include"
+    include.mkdir()
+    src = cpp_cross_file_project / "src"
+    src.mkdir()
+
+    (include / "Resource.h").write_text(
+        encoding="utf-8",
+        data="""\
+#pragma once
+
+class Resource {
+public:
+    Resource();
+    Resource(int size);
+    ~Resource();
+    void reset();
+private:
+    int* data_;
+};
+""",
+    )
+
+    (src / "Resource.cpp").write_text(
+        encoding="utf-8",
+        data="""\
+#include "Resource.h"
+
+Resource::Resource() : data_(nullptr) {}
+
+Resource::Resource(int size) {
+    data_ = new int[size];
+}
+
+Resource::~Resource() {
+    delete[] data_;
+}
+
+void Resource::reset() {
+    delete[] data_;
+    data_ = nullptr;
+}
+""",
+    )
+
+    run_updater(cpp_cross_file_project, mock_ingestor)
+
+    class_qns = _get_class_qns(mock_ingestor)
+    header_classes = [qn for qn in class_qns if "include" in qn and "Resource" in qn]
+    assert header_classes, f"Expected Resource class in include/, got: {class_qns}"
+    header_class_qn = header_classes[0]
+
+    edges = _get_defines_method_edges(mock_ingestor)
+    methods_linked = {
+        mq.split(SEPARATOR_DOT)[-1] for cq, mq in edges if cq == header_class_qn
+    }
+
+    assert "Resource" in methods_linked, (
+        f"Constructor not linked to header class. Methods: {methods_linked}"
+    )
+    assert "~Resource" in methods_linked, (
+        f"Destructor not linked to header class. Methods: {methods_linked}"
+    )
+    assert "reset" in methods_linked, (
+        f"'reset' not linked to header class. Methods: {methods_linked}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Test: nested namespace cross-file methods
+# ---------------------------------------------------------------------------
+
+
+def test_nested_namespace_cross_file(
+    cpp_cross_file_project: Path,
+    mock_ingestor: MagicMock,
+) -> None:
+    """Class inside nested namespaces, methods implemented in separate .cpp."""
+    include = cpp_cross_file_project / "include"
+    include.mkdir()
+    src = cpp_cross_file_project / "src"
+    src.mkdir()
+
+    (include / "Logger.h").write_text(
+        encoding="utf-8",
+        data="""\
+#pragma once
+
+namespace app {
+namespace logging {
+
+class Logger {
+public:
+    void info(const char* msg);
+    void error(const char* msg);
+};
+
+} // namespace logging
+} // namespace app
+""",
+    )
+
+    (src / "Logger.cpp").write_text(
+        encoding="utf-8",
+        data="""\
+#include "Logger.h"
+
+namespace app {
+namespace logging {
+
+void Logger::info(const char* msg) { /* ... */ }
+void Logger::error(const char* msg) { /* ... */ }
+
+} // namespace logging
+} // namespace app
+""",
+    )
+
+    run_updater(cpp_cross_file_project, mock_ingestor)
+
+    class_qns = _get_class_qns(mock_ingestor)
+    header_classes = [qn for qn in class_qns if "include" in qn and "Logger" in qn]
+    assert header_classes, f"Expected Logger class in include/, got: {class_qns}"
+    header_class_qn = header_classes[0]
+
+    edges = _get_defines_method_edges(mock_ingestor)
+    methods_linked = {
+        mq.split(SEPARATOR_DOT)[-1] for cq, mq in edges if cq == header_class_qn
+    }
+
+    assert "info" in methods_linked, (
+        f"'info' not linked to header Logger. Methods: {methods_linked}"
+    )
+    assert "error" in methods_linked, (
+        f"'error' not linked to header Logger. Methods: {methods_linked}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Test: no orphan methods remain (aggregate check)
+# ---------------------------------------------------------------------------
+
+
+def test_no_orphan_methods_across_files(
+    cpp_cross_file_project: Path,
+    mock_ingestor: MagicMock,
+) -> None:
+    """Every Method node must have at least one incoming DEFINES_METHOD edge."""
+    include = cpp_cross_file_project / "include"
+    include.mkdir()
+    src = cpp_cross_file_project / "src"
+    src.mkdir()
+
+    (include / "Widget.h").write_text(
+        encoding="utf-8",
+        data="""\
+#pragma once
+
+class Widget {
+public:
+    void draw();
+    void resize(int w, int h);
+    void hide();
+};
+""",
+    )
+
+    (src / "Widget.cpp").write_text(
+        encoding="utf-8",
+        data="""\
+#include "Widget.h"
+
+void Widget::draw() { /* ... */ }
+void Widget::resize(int w, int h) { /* ... */ }
+void Widget::hide() { /* ... */ }
+""",
+    )
+
+    run_updater(cpp_cross_file_project, mock_ingestor)
+
+    method_qns = _get_method_qns(mock_ingestor)
+    edges = _get_defines_method_edges(mock_ingestor)
+    methods_with_edges = {mq for _, mq in edges}
+
+    orphans = method_qns - methods_with_edges
+    # Filter to only methods belonging to Widget (other methods from inline
+    # definitions always have edges).
+    widget_orphans = {qn for qn in orphans if "Widget" in qn}
+    assert not widget_orphans, (
+        f"Found orphan Widget Method nodes with no DEFINES_METHOD edge: "
+        f"{widget_orphans}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Test: same-file out-of-class still works (regression)
+# ---------------------------------------------------------------------------
+
+
+def test_same_file_out_of_class_still_works(
+    cpp_cross_file_project: Path,
+    mock_ingestor: MagicMock,
+) -> None:
+    """When class and implementations are in the same .cpp, nothing breaks."""
+    (cpp_cross_file_project / "single.cpp").write_text(
+        encoding="utf-8",
+        data="""\
+class Foo {
+public:
+    void bar();
+    int baz(int x);
+};
+
+void Foo::bar() { /* ... */ }
+int Foo::baz(int x) { return x; }
+""",
+    )
+
+    run_updater(cpp_cross_file_project, mock_ingestor)
+
+    method_names = _method_names_for_class(mock_ingestor, "Foo")
+    assert "bar" in method_names, f"Expected 'bar', got: {method_names}"
+    assert "baz" in method_names, f"Expected 'baz', got: {method_names}"
diff --git a/codebase_rag/tests/test_cypher_validation.py b/codebase_rag/tests/test_cypher_validation.py
new file mode 100644
index 000000000..af99c798e
--- /dev/null
+++ b/codebase_rag/tests/test_cypher_validation.py
@@ -0,0 +1,165 @@
+import re
+
+import pytest
+
+from codebase_rag import constants as cs
+from codebase_rag import exceptions as ex
+from codebase_rag.services.llm import (
+    _build_keyword_pattern,
+    _validate_cypher_read_only,
+)
+
+
+class TestBuildKeywordPattern:
+    def test_single_word_uses_word_boundaries(self) -> None:
+        pattern = _build_keyword_pattern("DELETE")
+        assert pattern.search("DELETE n") is not None
+        assert pattern.search("XDELETE") is None
+        assert pattern.search("DELETEX") is None
+
+    def test_multi_word_allows_whitespace_between_parts(self) -> None:
+        pattern = _build_keyword_pattern("LOAD CSV")
+        assert pattern.search("LOAD CSV") is not None
+        assert pattern.search("LOAD  CSV") is not None
+        assert pattern.search("LOAD\nCSV") is not None
+        assert pattern.search("LOAD\t CSV") is not None
+
+    def test_multi_word_allows_block_comment_between_parts(self) -> None:
+        pattern = _build_keyword_pattern("LOAD CSV")
+        assert pattern.search("LOAD/*bypass*/CSV") is not None
+        assert pattern.search("LOAD /* comment */ CSV") is not None
+
+    def test_multi_word_allows_single_line_comment_between_parts(self) -> None:
+        pattern = _build_keyword_pattern("LOAD CSV")
+        assert pattern.search("LOAD //comment\nCSV") is not None
+        assert pattern.search("LOAD //\nCSV") is not None
+
+    def test_multi_word_respects_word_boundaries(self) -> None:
+        pattern = _build_keyword_pattern("LOAD CSV")
+        assert pattern.search("PRELOAD CSV") is None
+        assert pattern.search("LOAD CSVX") is None
+
+    def test_single_word_is_case_sensitive_on_input(self) -> None:
+        pattern = _build_keyword_pattern("DELETE")
+        assert pattern.search("DELETE") is not None
+        assert pattern.search("delete") is None
+
+    def test_returns_compiled_pattern(self) -> None:
+        pattern = _build_keyword_pattern("SET")
+        assert isinstance(pattern, re.Pattern)
+
+    def test_multi_word_has_dotall_flag(self) -> None:
+        pattern = _build_keyword_pattern("CREATE INDEX")
+        assert pattern.flags & re.DOTALL
+
+    def test_all_dangerous_keywords_produce_valid_patterns(self) -> None:
+        for kw in cs.CYPHER_DANGEROUS_KEYWORDS:
+            pattern = _build_keyword_pattern(kw)
+            assert pattern.search(kw) is not None
+
+
+class TestValidateCypherReadOnly:
+    def test_safe_match_query_passes(self) -> None:
+        _validate_cypher_read_only("MATCH (n) RETURN n;")
+
+    def test_safe_match_with_where_passes(self) -> None:
+        _validate_cypher_read_only("MATCH (n:Function) WHERE n.name = 'foo' RETURN n;")
+
+    def test_safe_optional_match_passes(self) -> None:
+        _validate_cypher_read_only(
+            "MATCH (a)-[:CALLS]->(b) OPTIONAL MATCH (b)-[:DEFINES]->(c) RETURN a, b, c;"
+        )
+
+    @pytest.mark.parametrize(
+        "keyword",
+        sorted(cs.CYPHER_DANGEROUS_KEYWORDS),
+    )
+    def test_rejects_all_dangerous_keywords(self, keyword: str) -> None:
+        query = f"MATCH (n) {keyword} n;"
+        with pytest.raises(ex.LLMGenerationError):
+            _validate_cypher_read_only(query)
+
+    def test_rejects_delete(self) -> None:
+        with pytest.raises(ex.LLMGenerationError, match="DELETE"):
_validate_cypher_read_only("MATCH (n) DELETE n;") + + def test_rejects_detach_delete(self) -> None: + with pytest.raises(ex.LLMGenerationError): + _validate_cypher_read_only("MATCH (n) DETACH DELETE n;") + + def test_rejects_drop(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="DROP"): + _validate_cypher_read_only("MATCH (n) DROP INDEX idx;") + + def test_rejects_set(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="SET"): + _validate_cypher_read_only("MATCH (n) SET n.name = 'x';") + + def test_rejects_merge(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="MERGE"): + _validate_cypher_read_only("MERGE (n:Node {id: 1});") + + def test_rejects_create(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="CREATE"): + _validate_cypher_read_only("CREATE (n:Node {name: 'test'});") + + def test_rejects_load_csv(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="LOAD CSV"): + _validate_cypher_read_only( + "LOAD CSV FROM 'http://evil.com/data.csv' AS row;" + ) + + def test_rejects_create_index(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="CREATE INDEX"): + _validate_cypher_read_only("CREATE INDEX ON :Node(name);") + + def test_case_insensitive(self) -> None: + with pytest.raises(ex.LLMGenerationError): + _validate_cypher_read_only("match (n) delete n;") + + def test_rejects_block_comment_bypass(self) -> None: + with pytest.raises(ex.LLMGenerationError): + _validate_cypher_read_only("LOAD/*bypass*/CSV FROM 'http://evil.com';") + + def test_rejects_single_line_comment_bypass(self) -> None: + with pytest.raises(ex.LLMGenerationError): + _validate_cypher_read_only("LOAD //bypass\nCSV FROM 'http://evil.com';") + + def test_does_not_flag_substring_matches(self) -> None: + _validate_cypher_read_only("MATCH (n) WHERE n.name = 'DATASET' RETURN n;") + + def test_does_not_flag_reset(self) -> None: + _validate_cypher_read_only("MATCH (n) WHERE n.name = 'RESET' RETURN n;") + + def test_does_not_flag_created_at(self) -> None: + _validate_cypher_read_only("MATCH (n) WHERE n.created_at > 0 RETURN n;") + + def test_error_includes_keyword_and_query(self) -> None: + query = "MATCH (n) DELETE n;" + with pytest.raises(ex.LLMGenerationError, match="DELETE") as exc_info: + _validate_cypher_read_only(query) + assert query in str(exc_info.value) + + def test_rejects_foreach(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="FOREACH"): + _validate_cypher_read_only( + "MATCH p=(a)-[*]->(b) FOREACH (n IN nodes(p) | SET n.marked = true);" + ) + + def test_rejects_remove(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="REMOVE"): + _validate_cypher_read_only("MATCH (n) REMOVE n.prop;") + + def test_rejects_call(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="CALL"): + _validate_cypher_read_only("CALL db.schema.visualization();") + + def test_rejects_create_constraint(self) -> None: + with pytest.raises(ex.LLMGenerationError, match="CREATE CONSTRAINT"): + _validate_cypher_read_only( + "CREATE CONSTRAINT ON (n:Node) ASSERT n.id IS UNIQUE;" + ) + + def test_rejects_multiline_block_comment_bypass(self) -> None: + with pytest.raises(ex.LLMGenerationError): + _validate_cypher_read_only("LOAD/*\nbypass\n*/CSV FROM 'http://evil.com';") diff --git a/codebase_rag/tests/test_directory_lister.py b/codebase_rag/tests/test_directory_lister.py index 9a7f480bc..40759be36 100644 --- a/codebase_rag/tests/test_directory_lister.py +++ b/codebase_rag/tests/test_directory_lister.py @@ -5,6 +5,7 @@ import 
pytest from pydantic_ai import Tool +from codebase_rag import tool_errors as te from codebase_rag.tools.directory_lister import ( DirectoryLister, create_directory_lister_tool, @@ -113,6 +114,24 @@ def test_list_with_hidden_files( assert ".hidden_file" in result assert "visible_file" in result + def test_list_directory_returns_error_for_path_outside_root( + self, directory_lister: DirectoryLister + ) -> None: + result = directory_lister.list_directory_contents("../../../etc") + expected = te.DIRECTORY_PATH_OUTSIDE_ROOT.format( + path="../../../etc", root=directory_lister.project_root + ) + assert result == expected + + def test_list_directory_returns_error_for_absolute_path_outside_root( + self, directory_lister: DirectoryLister + ) -> None: + result = directory_lister.list_directory_contents("/etc/passwd") + expected = te.DIRECTORY_PATH_OUTSIDE_ROOT.format( + path="/etc/passwd", root=directory_lister.project_root + ) + assert result == expected + class TestGetSafePath: def test_safe_path_with_relative_path( diff --git a/codebase_rag/tests/test_embedder.py b/codebase_rag/tests/test_embedder.py index 401044582..092197301 100644 --- a/codebase_rag/tests/test_embedder.py +++ b/codebase_rag/tests/test_embedder.py @@ -1,10 +1,13 @@ from __future__ import annotations +import tempfile from collections.abc import Generator +from pathlib import Path from unittest.mock import MagicMock, patch import pytest +from codebase_rag.embedder import EmbeddingCache, clear_embedding_cache from codebase_rag.utils.dependencies import has_torch, has_transformers @@ -44,6 +47,13 @@ def reset_model_cache() -> Generator[None, None, None]: get_model.cache_clear() +@pytest.fixture(autouse=True) +def reset_cache() -> Generator[None, None, None]: + clear_embedding_cache() + yield + clear_embedding_cache() + + @pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") def test_embed_code_returns_768_dimensional_vector( mock_unixcoder: MagicMock, reset_model_cache: None @@ -192,3 +202,305 @@ def test_embed_code_raises_without_dependencies() -> None: with pytest.raises(RuntimeError, match="Semantic search requires"): embed_code("x = 1") + + +def test_embedding_cache_put_and_get() -> None: + cache = EmbeddingCache() + embedding = [0.1, 0.2, 0.3] + cache.put("def foo(): pass", embedding) + assert cache.get("def foo(): pass") == embedding + + +def test_embedding_cache_miss_returns_none() -> None: + cache = EmbeddingCache() + assert cache.get("unknown code") is None + + +def test_embedding_cache_different_content_different_key() -> None: + cache = EmbeddingCache() + cache.put("code_a", [1.0]) + cache.put("code_b", [2.0]) + assert cache.get("code_a") == [1.0] + assert cache.get("code_b") == [2.0] + + +def test_embedding_cache_overwrite() -> None: + cache = EmbeddingCache() + cache.put("code_a", [1.0]) + cache.put("code_a", [9.9]) + assert cache.get("code_a") == [9.9] + + +def test_embedding_cache_len() -> None: + cache = EmbeddingCache() + assert len(cache) == 0 + cache.put("a", [1.0]) + assert len(cache) == 1 + cache.put("b", [2.0]) + assert len(cache) == 2 + + +def test_embedding_cache_clear() -> None: + cache = EmbeddingCache() + cache.put("a", [1.0]) + cache.put("b", [2.0]) + cache.clear() + assert len(cache) == 0 + assert cache.get("a") is None + + +def test_embedding_cache_get_many() -> None: + cache = EmbeddingCache() + cache.put("a", [1.0]) + cache.put("b", [2.0]) + results = cache.get_many(["a", "c", "b"]) + assert results == {0: [1.0], 2: [2.0]} + + +def test_embedding_cache_put_many() -> 
None: + cache = EmbeddingCache() + cache.put_many(["x", "y"], [[1.0], [2.0]]) + assert cache.get("x") == [1.0] + assert cache.get("y") == [2.0] + + +def test_embedding_cache_save_and_load() -> None: + with tempfile.TemporaryDirectory() as tmpdir: + cache_path = Path(tmpdir) / "test_cache.json" + cache = EmbeddingCache(path=cache_path) + cache.put("hello", [0.5, 0.6]) + cache.save() + + assert cache_path.exists() + + cache2 = EmbeddingCache(path=cache_path) + cache2.load() + assert cache2.get("hello") == [0.5, 0.6] + + +def test_embedding_cache_load_nonexistent_path() -> None: + cache = EmbeddingCache(path=Path("/nonexistent/path/cache.json")) + cache.load() + assert len(cache) == 0 + + +def test_embedding_cache_load_corrupt_file() -> None: + with tempfile.TemporaryDirectory() as tmpdir: + cache_path = Path(tmpdir) / "corrupt.json" + cache_path.write_text("not valid json data", encoding="utf-8") + cache = EmbeddingCache(path=cache_path) + cache.load() + assert len(cache) == 0 + + +def test_embedding_cache_save_no_path() -> None: + cache = EmbeddingCache(path=None) + cache.put("a", [1.0]) + cache.save() + + +def test_embedding_cache_load_no_path() -> None: + cache = EmbeddingCache(path=None) + cache.load() + assert len(cache) == 0 + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_uses_cache( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code, get_embedding_cache + + mock_embedding = torch.zeros(1, 768) + mock_unixcoder.return_value = (torch.zeros(1, 5, 768), mock_embedding) + + cache = get_embedding_cache() + cache.put("cached_code", [0.42] * 768) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + result = embed_code("cached_code") + + assert result == [0.42] * 768 + mock_unixcoder.tokenize.assert_not_called() + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_populates_cache( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code, get_embedding_cache + + mock_embedding = torch.ones(1, 768) + mock_unixcoder.return_value = (torch.zeros(1, 5, 768), mock_embedding) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + embed_code("new_code") + + cache = get_embedding_cache() + assert cache.get("new_code") is not None + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_batch_empty_list(reset_model_cache: None) -> None: + from codebase_rag.embedder import embed_code_batch + + assert embed_code_batch([]) == [] + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_batch_returns_correct_count( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code_batch + + snippets = ["def a(): pass", "def b(): pass", "def c(): pass"] + mock_unixcoder.tokenize.return_value = [[1, 2, 3]] * 3 + mock_embedding = torch.zeros(3, 768) + mock_unixcoder.return_value = (torch.zeros(3, 5, 768), mock_embedding) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + results = embed_code_batch(snippets) + + assert len(results) == 3 + assert all(len(emb) == 768 for emb in results) + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") 
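+# padding=True below is what lets snippets of different lengths be tokenized
+# into one rectangular batch that the model can embed in a single forward pass.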
+def test_embed_code_batch_uses_padding( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code_batch + + snippets = ["short", "longer code here"] + mock_unixcoder.tokenize.return_value = [[1, 2, 3, 0, 0], [1, 2, 3, 4, 5]] + mock_embedding = torch.zeros(2, 768) + mock_unixcoder.return_value = (torch.zeros(2, 5, 768), mock_embedding) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + embed_code_batch(snippets) + + mock_unixcoder.tokenize.assert_called_once_with( + snippets, max_length=512, padding=True + ) + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_batch_cache_hit( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + from codebase_rag.embedder import embed_code_batch, get_embedding_cache + + cache = get_embedding_cache() + cache.put("a", [1.0] * 768) + cache.put("b", [2.0] * 768) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + results = embed_code_batch(["a", "b"]) + + mock_unixcoder.tokenize.assert_not_called() + assert results == [[1.0] * 768, [2.0] * 768] + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_batch_partial_cache( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code_batch, get_embedding_cache + + cache = get_embedding_cache() + cache.put("a", [1.0] * 768) + + mock_unixcoder.tokenize.return_value = [[1, 2, 3]] + mock_embedding = torch.full((1, 768), 3.0) + mock_unixcoder.return_value = (torch.zeros(1, 5, 768), mock_embedding) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + results = embed_code_batch(["a", "b"]) + + assert results[0] == [1.0] * 768 + assert results[1] == [3.0] * 768 + mock_unixcoder.tokenize.assert_called_once_with(["b"], max_length=512, padding=True) + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_batch_populates_cache( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code_batch, get_embedding_cache + + mock_unixcoder.tokenize.return_value = [[1, 2, 3]] + mock_embedding = torch.ones(1, 768) + mock_unixcoder.return_value = (torch.zeros(1, 5, 768), mock_embedding) + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + embed_code_batch(["new_snippet"]) + + cache = get_embedding_cache() + assert cache.get("new_snippet") is not None + + +@pytest.mark.skipif(not _has_semantic_deps(), reason="torch/transformers not installed") +def test_embed_code_batch_respects_batch_size( + mock_unixcoder: MagicMock, reset_model_cache: None +) -> None: + import torch + + from codebase_rag.embedder import embed_code_batch + + snippets = [f"def f{i}(): pass" for i in range(5)] + + def side_effect_tokenize(batch: list[str], **kwargs: int | bool) -> list[list[int]]: + return [[1, 2, 3]] * len(batch) + + mock_unixcoder.tokenize.side_effect = side_effect_tokenize + + def side_effect_forward(tensor: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + n = tensor.shape[0] + return torch.zeros(n, 5, 768), torch.zeros(n, 768) + + mock_unixcoder.side_effect = side_effect_forward + + with patch("codebase_rag.embedder.get_model", return_value=mock_unixcoder): + results = embed_code_batch(snippets, batch_size=2) + + assert len(results) 
== 5 + assert mock_unixcoder.tokenize.call_count == 3 + + +def test_embed_code_batch_raises_without_dependencies() -> None: + if _has_semantic_deps(): + pytest.skip("Dependencies are installed") + + from codebase_rag.embedder import embed_code_batch + + with pytest.raises(RuntimeError, match="Semantic search requires"): + embed_code_batch(["x = 1"]) + + +def test_embedding_cache_persistence_roundtrip() -> None: + with tempfile.TemporaryDirectory() as tmpdir: + cache_path = Path(tmpdir) / "subdir" / "cache.json" + + cache1 = EmbeddingCache(path=cache_path) + cache1.put("fn_a", [0.1, 0.2]) + cache1.put("fn_b", [0.3, 0.4]) + cache1.save() + + cache2 = EmbeddingCache(path=cache_path) + cache2.load() + assert cache2.get("fn_a") == [0.1, 0.2] + assert cache2.get("fn_b") == [0.3, 0.4] + assert cache2.get("fn_c") is None + assert len(cache2) == 2 diff --git a/codebase_rag/tests/test_function_ingest.py b/codebase_rag/tests/test_function_ingest.py index 814380ce4..ef2556325 100644 --- a/codebase_rag/tests/test_function_ingest.py +++ b/codebase_rag/tests/test_function_ingest.py @@ -234,7 +234,7 @@ def inner_func(): lang_config = queries[cs.SupportedLanguage.PYTHON]["config"] result = definition_processor._is_method(inner_func, lang_config) - assert result is True + assert result is False class TestFormatNestedQn: @@ -466,7 +466,9 @@ def test_basic_function_props( is_exported=False, ) - result = definition_processor._build_function_props(func_node, resolution) + result = definition_processor._build_function_props( + func_node, resolution, "proj.module" + ) assert result["qualified_name"] == "proj.module.my_function" assert result["name"] == "my_function" @@ -497,7 +499,9 @@ def test_exported_function_props( is_exported=True, ) - result = definition_processor._build_function_props(func_node, resolution) + result = definition_processor._build_function_props( + func_node, resolution, "proj.module" + ) assert result["is_exported"] is True diff --git a/codebase_rag/tests/test_github_issues_integration.py b/codebase_rag/tests/test_github_issues_integration.py index 2b6bc081f..423945657 100644 --- a/codebase_rag/tests/test_github_issues_integration.py +++ b/codebase_rag/tests/test_github_issues_integration.py @@ -1,7 +1,10 @@ import os from unittest.mock import patch +import pytest + from codebase_rag.config import AppConfig +from codebase_rag.constants import GoogleProviderType class TestGitHubIssuesIntegration: @@ -142,9 +145,6 @@ def test_openai_compatible_endpoints(self) -> None: assert orchestrator.endpoint == "https://api.together.xyz/v1" def test_vertex_ai_enterprise_scenario(self) -> None: - """ - Test enterprise Vertex AI configuration scenario. 
- """ env_content = { "ORCHESTRATOR_PROVIDER": "google", "ORCHESTRATOR_MODEL": "gemini-2.5-pro", @@ -162,9 +162,63 @@ def test_vertex_ai_enterprise_scenario(self) -> None: assert orchestrator.model_id == "gemini-2.5-pro" assert orchestrator.project_id == "my-enterprise-project" assert orchestrator.region == "us-central1" - assert orchestrator.provider_type == "vertex" + assert orchestrator.provider_type == GoogleProviderType.VERTEX assert orchestrator.service_account_file == "/path/to/service-account.json" + def test_vertex_ai_skips_api_key_validation(self) -> None: + env_content = { + "ORCHESTRATOR_PROVIDER": "google", + "ORCHESTRATOR_MODEL": "gemini-2.5-pro", + "ORCHESTRATOR_PROJECT_ID": "my-project", + "ORCHESTRATOR_REGION": "us-central1", + "ORCHESTRATOR_PROVIDER_TYPE": "vertex", + "ORCHESTRATOR_SERVICE_ACCOUNT_FILE": "/path/to/sa.json", + "CYPHER_PROVIDER": "google", + "CYPHER_MODEL": "gemini-2.5-flash", + "CYPHER_PROJECT_ID": "my-project", + "CYPHER_REGION": "us-central1", + "CYPHER_PROVIDER_TYPE": "vertex", + "CYPHER_SERVICE_ACCOUNT_FILE": "/path/to/sa.json", + } + + with patch.dict(os.environ, env_content): + config = AppConfig() + + orchestrator = config.active_orchestrator_config + orchestrator.validate_api_key("orchestrator") + + cypher = config.active_cypher_config + cypher.validate_api_key("cypher") + + def test_vertex_ai_with_google_api_key_env_does_not_error(self) -> None: + env_content = { + "ORCHESTRATOR_PROVIDER": "google", + "ORCHESTRATOR_MODEL": "gemini-2.5-pro", + "ORCHESTRATOR_PROJECT_ID": "my-project", + "ORCHESTRATOR_PROVIDER_TYPE": "vertex", + "ORCHESTRATOR_SERVICE_ACCOUNT_FILE": "/path/to/sa.json", + "GOOGLE_API_KEY": "stray-key-from-env", + } + + with patch.dict(os.environ, env_content): + config = AppConfig() + orchestrator = config.active_orchestrator_config + orchestrator.validate_api_key("orchestrator") + + def test_google_gla_without_api_key_raises(self) -> None: + env_content = { + "ORCHESTRATOR_PROVIDER": "google", + "ORCHESTRATOR_MODEL": "gemini-2.5-pro", + "ORCHESTRATOR_PROVIDER_TYPE": "gla", + "ORCHESTRATOR_API_KEY": "", + } + + with patch.dict(os.environ, env_content): + config = AppConfig() + orchestrator = config.active_orchestrator_config + with pytest.raises(ValueError, match="API Key Missing"): + orchestrator.validate_api_key("orchestrator") + def test_reasoning_model_thinking_budget(self) -> None: """ Test configuration for reasoning models with thinking budget. 
diff --git a/codebase_rag/tests/test_graph_service.py b/codebase_rag/tests/test_graph_service.py index c31b30741..b5f7b85e7 100644 --- a/codebase_rag/tests/test_graph_service.py +++ b/codebase_rag/tests/test_graph_service.py @@ -5,7 +5,13 @@ import pytest from codebase_rag.constants import NODE_UNIQUE_CONSTRAINTS -from codebase_rag.cypher_queries import wrap_with_unwind +from codebase_rag.cypher_queries import ( + build_create_node_query, + build_create_relationship_query, + build_merge_node_query, + build_merge_relationship_query, + wrap_with_unwind, +) from codebase_rag.services.graph_service import MemgraphIngestor @@ -38,13 +44,63 @@ def test_init_creates_empty_buffers(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) assert ingestor.node_buffer == [] - assert ingestor.relationship_buffer == [] + assert ingestor._rel_count == 0 def test_init_conn_is_none(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) assert ingestor.conn is None + def test_init_stores_auth_credentials(self) -> None: + ingestor = MemgraphIngestor( + host="localhost", port=7687, username="user", password="pass" + ) + + assert ingestor._username == "user" + assert ingestor._password == "pass" + + def test_init_defaults_auth_to_none(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + + assert ingestor._username is None + assert ingestor._password is None + + def test_init_raises_for_username_without_password(self) -> None: + with pytest.raises(ValueError, match="Both username and password"): + MemgraphIngestor(host="localhost", port=7687, username="user") + + def test_init_raises_for_password_without_username(self) -> None: + with pytest.raises(ValueError, match="Both username and password"): + MemgraphIngestor(host="localhost", port=7687, password="pass") + + def test_init_normalizes_empty_strings_to_none(self) -> None: + ingestor = MemgraphIngestor( + host="localhost", port=7687, username="", password="" + ) + + assert ingestor._username is None + assert ingestor._password is None + + def test_init_normalizes_whitespace_only_to_none(self) -> None: + ingestor = MemgraphIngestor( + host="localhost", port=7687, username=" ", password=" " + ) + + assert ingestor._username is None + assert ingestor._password is None + + def test_init_strips_whitespace_from_credentials(self) -> None: + ingestor = MemgraphIngestor( + host="localhost", port=7687, username=" user ", password=" pass " + ) + + assert ingestor._username == "user" + assert ingestor._password == "pass" + + def test_init_raises_for_empty_password_with_valid_username(self) -> None: + with pytest.raises(ValueError, match="Both username and password"): + MemgraphIngestor(host="localhost", port=7687, username="user", password="") + class TestContextManager: def test_enter_connects_to_memgraph(self) -> None: @@ -60,12 +116,36 @@ def test_enter_connects_to_memgraph(self) -> None: assert mock_conn.autocommit is True assert result is ingestor + def test_enter_passes_auth_when_provided(self) -> None: + with patch("codebase_rag.services.graph_service.mgclient") as mock_mgclient: + mock_conn = MagicMock() + mock_mgclient.connect.return_value = mock_conn + + ingestor = MemgraphIngestor( + host="testhost", port=1234, username="user", password="pass" + ) + ingestor.__enter__() + + mock_mgclient.connect.assert_called_once_with( + host="testhost", port=1234, username="user", password="pass" + ) + + def test_enter_omits_auth_when_not_provided(self) -> None: + with 
patch("codebase_rag.services.graph_service.mgclient") as mock_mgclient: + mock_conn = MagicMock() + mock_mgclient.connect.return_value = mock_conn + + ingestor = MemgraphIngestor(host="testhost", port=1234) + ingestor.__enter__() + + mock_mgclient.connect.assert_called_once_with(host="testhost", port=1234) + def test_exit_flushes_and_closes_connection(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) mock_conn = MagicMock() ingestor.conn = mock_conn - with patch.object(ingestor, "flush_all") as mock_flush: + with patch.object(MemgraphIngestor, "flush_all") as mock_flush: ingestor.__exit__(None, None, None) mock_flush.assert_called_once() @@ -76,7 +156,7 @@ def test_exit_logs_error_on_exception(self) -> None: mock_conn = MagicMock() ingestor.conn = mock_conn - with patch.object(ingestor, "flush_all"): + with patch.object(MemgraphIngestor, "flush_all"): ingestor.__exit__(ValueError, ValueError("test error"), None) mock_conn.close.assert_called_once() @@ -85,7 +165,7 @@ def test_exit_handles_none_connection(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) ingestor.conn = None - with patch.object(ingestor, "flush_all"): + with patch.object(MemgraphIngestor, "flush_all"): ingestor.__exit__(None, None, None) @@ -206,19 +286,13 @@ def test_suppresses_already_exists_errors_in_logs(self) -> None: ingestor._execute_query("CREATE CONSTRAINT") -class TestExecuteBatch: - def test_returns_early_when_not_connected(self) -> None: - ingestor = MemgraphIngestor(host="localhost", port=7687) - ingestor.conn = None - - ingestor._execute_batch("MERGE (n:Test)", [{"id": 1}]) - +class TestExecuteBatchOn: def test_returns_early_when_params_empty(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) mock_conn = MagicMock() ingestor.conn = mock_conn - ingestor._execute_batch("MERGE (n:Test)", []) + ingestor._execute_batch_on(mock_conn, "MERGE (n:Test)", []) mock_conn.cursor.assert_not_called() @@ -229,7 +303,9 @@ def test_wraps_query_with_unwind(self) -> None: mock_conn.cursor.return_value = mock_cursor ingestor.conn = mock_conn - ingestor._execute_batch("MERGE (n:Test {id: row.id})", [{"id": 1}, {"id": 2}]) + ingestor._execute_batch_on( + mock_conn, "MERGE (n:Test {id: row.id})", [{"id": 1}, {"id": 2}] + ) call_args = mock_cursor.execute.call_args[0] assert call_args[0] == wrap_with_unwind("MERGE (n:Test {id: row.id})") @@ -242,7 +318,7 @@ def test_closes_cursor_on_success(self) -> None: mock_conn.cursor.return_value = mock_cursor ingestor.conn = mock_conn - ingestor._execute_batch("MERGE (n:Test)", [{"id": 1}]) + ingestor._execute_batch_on(mock_conn, "MERGE (n:Test)", [{"id": 1}]) mock_cursor.close.assert_called_once() @@ -251,7 +327,7 @@ class TestCleanDatabase: def test_executes_delete_query(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) - with patch.object(ingestor, "_execute_query") as mock_execute: + with patch.object(MemgraphIngestor, "_execute_query") as mock_execute: ingestor.clean_database() mock_execute.assert_called_once_with("MATCH (n) DETACH DELETE n;") @@ -265,7 +341,9 @@ def test_creates_constraint_for_each_node_type(self) -> None: def capture_query(query: str) -> None: executed_queries.append(query) - with patch.object(ingestor, "_execute_query", side_effect=capture_query): + with patch.object( + MemgraphIngestor, "_execute_query", side_effect=capture_query + ): ingestor.ensure_constraints() for label, prop in NODE_UNIQUE_CONSTRAINTS.items(): @@ -282,7 +360,9 @@ def fail_then_succeed(query: str) -> None: if 
call_count == 1: raise RuntimeError("Constraint already exists") - with patch.object(ingestor, "_execute_query", side_effect=fail_then_succeed): + with patch.object( + MemgraphIngestor, "_execute_query", side_effect=fail_then_succeed + ): ingestor.ensure_constraints() expected_queries = len(NODE_UNIQUE_CONSTRAINTS) * 2 @@ -384,7 +464,7 @@ def mock_fetch_all(query: str, params: dict | None = None) -> list[dict]: return [{"node_id": 1}, {"node_id": 2}, {"node_id": 3}] return [{"from_id": 1, "to_id": 2}] - with patch.object(ingestor, "fetch_all", side_effect=mock_fetch_all): + with patch.object(MemgraphIngestor, "fetch_all", side_effect=mock_fetch_all): result = ingestor.export_graph_to_dict() assert result["metadata"]["total_nodes"] == 3 @@ -396,8 +476,8 @@ def test_calls_flush_nodes_and_flush_relationships(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) with ( - patch.object(ingestor, "flush_nodes") as mock_nodes, - patch.object(ingestor, "flush_relationships") as mock_rels, + patch.object(MemgraphIngestor, "flush_nodes") as mock_nodes, + patch.object(MemgraphIngestor, "flush_relationships") as mock_rels, ): ingestor.flush_all() @@ -410,7 +490,7 @@ def test_fetch_all_delegates_to_execute_query(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) with patch.object( - ingestor, "_execute_query", return_value=[{"n": "result"}] + MemgraphIngestor, "_execute_query", return_value=[{"n": "result"}] ) as mock_exec: result = ingestor.fetch_all("MATCH (n) RETURN n", {"limit": 10}) @@ -420,7 +500,7 @@ def test_fetch_all_delegates_to_execute_query(self) -> None: def test_execute_write_delegates_to_execute_query(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) - with patch.object(ingestor, "_execute_query") as mock_exec: + with patch.object(MemgraphIngestor, "_execute_query") as mock_exec: ingestor.execute_write("CREATE (n:Test)", {"name": "test"}) mock_exec.assert_called_once_with("CREATE (n:Test)", {"name": "test"}) @@ -434,3 +514,187 @@ def test_returns_iso_format_timestamp(self) -> None: assert "T" in result assert len(result) > 10 + + +class TestCreateMode: + def test_default_use_merge_is_true(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + assert ingestor._use_merge is True + + def test_use_merge_false(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687, use_merge=False) + assert ingestor._use_merge is False + + def test_flush_nodes_uses_merge_query_by_default(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687, batch_size=10) + mock_conn = MagicMock() + mock_cursor = MagicMock() + mock_conn.cursor.return_value = mock_cursor + ingestor.conn = mock_conn + + ingestor.node_buffer.append(("File", {"path": "/test.py", "name": "test"})) + ingestor.flush_nodes() + + call_args = mock_cursor.execute.call_args[0][0] + assert "MERGE" in call_args + assert "CREATE" not in call_args.split("MERGE")[0] + + def test_flush_nodes_uses_create_query_when_merge_disabled(self) -> None: + ingestor = MemgraphIngestor( + host="localhost", port=7687, batch_size=10, use_merge=False + ) + mock_conn = MagicMock() + mock_cursor = MagicMock() + mock_conn.cursor.return_value = mock_cursor + ingestor.conn = mock_conn + + ingestor.node_buffer.append(("File", {"path": "/test.py", "name": "test"})) + ingestor.flush_nodes() + + call_args = mock_cursor.execute.call_args[0][0] + assert "CREATE" in call_args + assert "MERGE" not in call_args + + def 
test_flush_relationships_uses_merge_query_by_default(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687, batch_size=10) + mock_conn = MagicMock() + mock_cursor = MagicMock() + mock_conn.cursor.return_value = mock_cursor + mock_cursor.description = [MagicMock(name="created")] + mock_cursor.description[0].name = "created" + mock_cursor.fetchall.return_value = [(1,)] + ingestor.conn = mock_conn + + ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), "IMPORTS", ("File", "path", "/b.py") + ) + ingestor.flush_relationships() + + call_args = mock_cursor.execute.call_args[0][0] + assert "MERGE" in call_args + + def test_flush_relationships_uses_create_query_when_merge_disabled(self) -> None: + ingestor = MemgraphIngestor( + host="localhost", port=7687, batch_size=10, use_merge=False + ) + mock_conn = MagicMock() + mock_cursor = MagicMock() + mock_conn.cursor.return_value = mock_cursor + mock_cursor.description = [MagicMock(name="created")] + mock_cursor.description[0].name = "created" + mock_cursor.fetchall.return_value = [(1,)] + ingestor.conn = mock_conn + + ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), "IMPORTS", ("File", "path", "/b.py") + ) + ingestor.flush_relationships() + + call_args = mock_cursor.execute.call_args[0][0] + assert "CREATE" in call_args + assert "MERGE" not in call_args + + +class TestPreGroupedRelBuffer: + def test_rel_groups_populated_on_ensure(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), "IMPORTS", ("File", "path", "/b.py") + ) + assert len(ingestor._rel_groups) == 1 + + def test_rel_groups_groups_by_pattern(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), "IMPORTS", ("File", "path", "/b.py") + ) + ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), "IMPORTS", ("File", "path", "/c.py") + ) + ingestor.ensure_relationship_batch( + ("Module", "qualified_name", "mod_a"), + "DEFINES", + ("Function", "qualified_name", "func_b"), + ) + assert len(ingestor._rel_groups) == 2 + pattern = ("File", "path", "IMPORTS", "File", "path") + assert len(ingestor._rel_groups[pattern]) == 2 + + def test_rel_groups_cleared_after_flush(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + mock_conn = MagicMock() + mock_cursor = MagicMock() + mock_conn.cursor.return_value = mock_cursor + mock_cursor.description = [MagicMock(name="created")] + mock_cursor.description[0].name = "created" + mock_cursor.fetchall.return_value = [(1,)] + ingestor.conn = mock_conn + + ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), "IMPORTS", ("File", "path", "/b.py") + ) + ingestor.flush_relationships() + + assert len(ingestor._rel_groups) == 0 + + def test_rel_groups_empty_on_init(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + assert len(ingestor._rel_groups) == 0 + + def test_rel_groups_correct_batch_row_values(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + ingestor.ensure_relationship_batch( + ("File", "path", "/a.py"), + "IMPORTS", + ("File", "path", "/b.py"), + {"weight": 1}, + ) + pattern = ("File", "path", "IMPORTS", "File", "path") + rows = ingestor._rel_groups[pattern] + assert len(rows) == 1 + assert rows[0]["from_val"] == "/a.py" + assert rows[0]["to_val"] == "/b.py" + assert rows[0]["props"] == {"weight": 1} + + +class TestSlots: + def test_has_slots(self) -> 
None: + assert hasattr(MemgraphIngestor, "__slots__") + + def test_no_dict(self) -> None: + ingestor = MemgraphIngestor(host="localhost", port=7687) + assert not hasattr(ingestor, "__dict__") + + +class TestCypherCreateQueries: + def test_build_create_node_query(self) -> None: + query = build_create_node_query("File", "path") + assert "CREATE" in query + assert "MERGE" not in query + assert "path: row.id" in query + + def test_build_create_relationship_query(self) -> None: + query = build_create_relationship_query( + "File", "path", "IMPORTS", "File", "path" + ) + assert "CREATE (a)-[r:IMPORTS]->(b)" in query + assert "MERGE" not in query + + def test_build_create_relationship_query_with_props(self) -> None: + query = build_create_relationship_query( + "File", "path", "IMPORTS", "File", "path", has_props=True + ) + assert "SET r += row.props" in query + assert "CREATE (a)-[r:IMPORTS]->(b)" in query + + def test_build_merge_node_query_unchanged(self) -> None: + query = build_merge_node_query("File", "path") + assert "MERGE" in query + assert "CREATE" not in query + + def test_build_merge_relationship_query_unchanged(self) -> None: + query = build_merge_relationship_query( + "File", "path", "IMPORTS", "File", "path" + ) + assert "MERGE" in query + assert "CREATE" not in query.replace("MERGE", "") diff --git a/codebase_rag/tests/test_graph_service_calls_failure_logging.py b/codebase_rag/tests/test_graph_service_calls_failure_logging.py index 2af717f06..6bb8f2e99 100644 --- a/codebase_rag/tests/test_graph_service_calls_failure_logging.py +++ b/codebase_rag/tests/test_graph_service_calls_failure_logging.py @@ -56,8 +56,8 @@ def test_calls_failure_logging_single_batch( ) with patch.object( - graph_service, - "_execute_batch_with_return", + MemgraphIngestor, + "_execute_batch_with_return_on", return_value=[{"created": 1}, {"created": 0}, {"created": 0}], ): graph_service.flush_relationships() @@ -72,13 +72,6 @@ def test_calls_failure_logging_single_batch( def test_calls_failure_logging_multiple_batches( graph_service: MemgraphIngestor, log_messages: list[str] ) -> None: - """Test that CALLS failures are logged correctly across multiple batches. 
- - This is the critical test case that validates the bug fix: - - Previously, the code used cumulative totals (total_attempted - total_successful) - - This would incorrectly report failures for batches after the first one - - Now it correctly uses batch-specific counts (len(params_list) - batch_successful) - """ graph_service.ensure_relationship_batch( ("Method", "qualified_name", "project.module.ClassA.methodA()"), "CALLS", @@ -104,14 +97,16 @@ def test_calls_failure_logging_multiple_batches( call_count = 0 def mock_execute_batch( - query: str, params_list: list[dict[str, Any]] + conn: Any, query: str, params_list: list[dict[str, Any]] ) -> list[dict[str, int]]: nonlocal call_count call_count += 1 return [{"created": 1}, {"created": 0}] with patch.object( - graph_service, "_execute_batch_with_return", side_effect=mock_execute_batch + MemgraphIngestor, + "_execute_batch_with_return_on", + side_effect=mock_execute_batch, ): graph_service.flush_relationships() @@ -127,7 +122,6 @@ def mock_execute_batch( def test_calls_success_no_failure_logging( graph_service: MemgraphIngestor, log_messages: list[str] ) -> None: - """Test that successful CALLS don't trigger failure warnings.""" graph_service.ensure_relationship_batch( ("Method", "qualified_name", "project.module.ClassA.methodA()"), "CALLS", @@ -140,8 +134,8 @@ def test_calls_success_no_failure_logging( ) with patch.object( - graph_service, - "_execute_batch_with_return", + MemgraphIngestor, + "_execute_batch_with_return_on", return_value=[{"created": 1}, {"created": 1}], ): graph_service.flush_relationships() @@ -154,7 +148,6 @@ def test_calls_success_no_failure_logging( def test_non_calls_relationships_no_failure_logging( graph_service: MemgraphIngestor, log_messages: list[str] ) -> None: - """Test that failures in non-CALLS relationships don't trigger CALLS-specific logging.""" graph_service.ensure_relationship_batch( ("Module", "qualified_name", "project.moduleA"), "IMPORTS", @@ -167,8 +160,8 @@ def test_non_calls_relationships_no_failure_logging( ) with patch.object( - graph_service, - "_execute_batch_with_return", + MemgraphIngestor, + "_execute_batch_with_return_on", return_value=[{"created": 1}, {"created": 0}], ): graph_service.flush_relationships() diff --git a/codebase_rag/tests/test_graph_updater_embeddings.py b/codebase_rag/tests/test_graph_updater_embeddings.py new file mode 100644 index 000000000..17b81815d --- /dev/null +++ b/codebase_rag/tests/test_graph_updater_embeddings.py @@ -0,0 +1,283 @@ +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.services.graph_service import MemgraphIngestor +from codebase_rag.types_defs import ResultRow + +MOCK_EMBEDDING = [0.1] * 768 + +_PATCH_DEPS = patch( + "codebase_rag.graph_updater.has_semantic_dependencies", return_value=True +) +_PATCH_EMBED = patch("codebase_rag.embedder.embed_code", return_value=MOCK_EMBEDDING) +_PATCH_STORE_BATCH = patch( + "codebase_rag.vector_store.store_embedding_batch", side_effect=lambda pts: len(pts) +) +_PATCH_RECONCILE = patch( + "codebase_rag.vector_store.verify_stored_ids", side_effect=lambda ids: ids +) + + +@pytest.fixture +def query_ingestor() -> MagicMock: + mock = MagicMock(spec=MemgraphIngestor) + mock.fetch_all = MagicMock(return_value=[]) + mock.execute_write = MagicMock() + return mock + + +@pytest.fixture +def updater_with_query(temp_repo: 
Path, query_ingestor: MagicMock) -> GraphUpdater: + parsers, queries = load_parsers() + return GraphUpdater( + ingestor=query_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + + +class TestCypherQueryEmbeddingsStructure: + def test_contains_starts_with_project_name(self) -> None: + assert "STARTS WITH" in cs.CYPHER_QUERY_EMBEDDINGS + assert "$project_name" in cs.CYPHER_QUERY_EMBEDDINGS + + def test_returns_required_columns(self) -> None: + query = cs.CYPHER_QUERY_EMBEDDINGS.upper() + for col in ["NODE_ID", "QUALIFIED_NAME", "START_LINE", "END_LINE", "PATH"]: + assert col in query + + def test_dot_concatenation_is_parenthesized(self) -> None: + assert "($project_name + '.')" in cs.CYPHER_QUERY_EMBEDDINGS + + def test_no_bare_starts_with_plus(self) -> None: + for line in cs.CYPHER_QUERY_EMBEDDINGS.splitlines(): + stripped = line.strip() + if "STARTS WITH" in stripped and "$project_name" in stripped: + assert "($project_name" in stripped, ( + f"$project_name + '.' must be parenthesized in: {stripped!r}" + ) + + +class TestGenerateSemanticEmbeddings: + @_PATCH_DEPS + @_PATCH_EMBED + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_passes_project_name_without_trailing_dot( + self, + _mock_reconcile: MagicMock, + _mock_store_batch: MagicMock, + _mock_embed: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + ) -> None: + query_ingestor.fetch_all.return_value = [] + updater_with_query._generate_semantic_embeddings() + + params = query_ingestor.fetch_all.call_args[0][1] + project_name_param = params["project_name"] + assert not project_name_param.endswith("."), ( + f"project_name should not have trailing dot, got: {project_name_param!r}" + ) + + @_PATCH_DEPS + @_PATCH_EMBED + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_uses_cypher_query_embeddings_constant( + self, + _mock_reconcile: MagicMock, + _mock_store_batch: MagicMock, + _mock_embed: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + ) -> None: + query_ingestor.fetch_all.return_value = [] + updater_with_query._generate_semantic_embeddings() + + query_arg = query_ingestor.fetch_all.call_args[0][0] + assert query_arg == cs.CYPHER_QUERY_EMBEDDINGS + + @patch("codebase_rag.graph_updater.has_semantic_dependencies", return_value=False) + def test_skips_when_no_semantic_dependencies( + self, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + ) -> None: + updater_with_query._generate_semantic_embeddings() + query_ingestor.fetch_all.assert_not_called() + + @_PATCH_DEPS + @_PATCH_EMBED + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_returns_early_on_empty_results( + self, + _mock_reconcile: MagicMock, + mock_store_batch: MagicMock, + _mock_embed: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + ) -> None: + query_ingestor.fetch_all.return_value = [] + updater_with_query._generate_semantic_embeddings() + mock_store_batch.assert_not_called() + + @_PATCH_DEPS + @_PATCH_EMBED + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_embeds_valid_function_with_source( + self, + _mock_reconcile: MagicMock, + mock_store_batch: MagicMock, + mock_embed: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + temp_repo: Path, + ) -> None: + (temp_repo / "module.py").write_text("def hello():\n return 42\n") + row: ResultRow = { + cs.KEY_NODE_ID: 1, + cs.KEY_QUALIFIED_NAME: 
"myproject.module.hello", + cs.KEY_START_LINE: 1, + cs.KEY_END_LINE: 2, + cs.KEY_PATH: "module.py", + } + query_ingestor.fetch_all.return_value = [row] + + updater_with_query._generate_semantic_embeddings() + + mock_embed.assert_called_once() + mock_store_batch.assert_called_once() + batch_arg = mock_store_batch.call_args[0][0] + assert len(batch_arg) == 1 + assert batch_arg[0] == (1, MOCK_EMBEDDING, "myproject.module.hello") + + @_PATCH_DEPS + @_PATCH_EMBED + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_skips_row_with_missing_source_info( + self, + _mock_reconcile: MagicMock, + mock_store_batch: MagicMock, + mock_embed: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + ) -> None: + row: ResultRow = { + cs.KEY_NODE_ID: 1, + cs.KEY_QUALIFIED_NAME: "myproject.module.hello", + } + query_ingestor.fetch_all.return_value = [row] + + updater_with_query._generate_semantic_embeddings() + + mock_embed.assert_not_called() + mock_store_batch.assert_not_called() + + @patch("codebase_rag.graph_updater.has_semantic_dependencies", return_value=True) + @patch("codebase_rag.embedder.embed_code", side_effect=RuntimeError("model error")) + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_handles_embed_failure_gracefully( + self, + _mock_reconcile: MagicMock, + mock_store_batch: MagicMock, + _mock_embed: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + temp_repo: Path, + ) -> None: + (temp_repo / "module.py").write_text("def hello():\n return 42\n") + row: ResultRow = { + cs.KEY_NODE_ID: 1, + cs.KEY_QUALIFIED_NAME: "myproject.module.hello", + cs.KEY_START_LINE: 1, + cs.KEY_END_LINE: 2, + cs.KEY_PATH: "module.py", + } + query_ingestor.fetch_all.return_value = [row] + + updater_with_query._generate_semantic_embeddings() + + mock_store_batch.assert_not_called() + + @_PATCH_DEPS + @_PATCH_EMBED + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_skips_unparseable_rows( + self, + _mock_reconcile: MagicMock, + mock_store_batch: MagicMock, + mock_embed: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + ) -> None: + bad_row: ResultRow = { + cs.KEY_NODE_ID: "not_an_int", + cs.KEY_QUALIFIED_NAME: "pkg.func", + } + query_ingestor.fetch_all.return_value = [bad_row] + + updater_with_query._generate_semantic_embeddings() + + mock_embed.assert_not_called() + mock_store_batch.assert_not_called() + + @_PATCH_DEPS + @_PATCH_EMBED + @_PATCH_STORE_BATCH + @_PATCH_RECONCILE + def test_counts_embedded_functions( + self, + _mock_reconcile: MagicMock, + mock_store_batch: MagicMock, + mock_embed: MagicMock, + _mock_deps: MagicMock, + updater_with_query: GraphUpdater, + query_ingestor: MagicMock, + temp_repo: Path, + ) -> None: + (temp_repo / "a.py").write_text("def f1():\n pass\n") + (temp_repo / "b.py").write_text("def f2():\n pass\n") + rows: list[ResultRow] = [ + { + cs.KEY_NODE_ID: 1, + cs.KEY_QUALIFIED_NAME: "proj.a.f1", + cs.KEY_START_LINE: 1, + cs.KEY_END_LINE: 2, + cs.KEY_PATH: "a.py", + }, + { + cs.KEY_NODE_ID: 2, + cs.KEY_QUALIFIED_NAME: "proj.b.f2", + cs.KEY_START_LINE: 1, + cs.KEY_END_LINE: 2, + cs.KEY_PATH: "b.py", + }, + ] + query_ingestor.fetch_all.return_value = rows + + updater_with_query._generate_semantic_embeddings() + + assert mock_embed.call_count == 2 + mock_store_batch.assert_called_once() + batch_arg = mock_store_batch.call_args[0][0] + assert len(batch_arg) == 2 diff --git a/codebase_rag/tests/test_graph_updater_incremental.py 
b/codebase_rag/tests/test_graph_updater_incremental.py new file mode 100644 index 000000000..1e0a16583 --- /dev/null +++ b/codebase_rag/tests/test_graph_updater_incremental.py @@ -0,0 +1,290 @@ +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import ( + BoundedASTCache, + FunctionRegistryTrie, + GraphUpdater, + _hash_file, + _load_hash_cache, + _save_hash_cache, +) +from codebase_rag.parser_loader import load_parsers + + +@pytest.fixture +def updater(temp_repo: Path, mock_ingestor: MagicMock) -> GraphUpdater: + parsers, queries = load_parsers() + return GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + + +@pytest.fixture +def py_project(temp_repo: Path) -> Path: + (temp_repo / "__init__.py").touch() + (temp_repo / "module_a.py").write_text("def func_a():\n pass\n") + (temp_repo / "module_b.py").write_text("def func_b():\n pass\n") + return temp_repo + + +class TestHashFile: + def test_hash_returns_hex_string(self, temp_repo: Path) -> None: + f = temp_repo / "test.py" + f.write_text("hello") + result = _hash_file(f) + assert isinstance(result, str) + assert len(result) == 64 + + def test_same_content_same_hash(self, temp_repo: Path) -> None: + f1 = temp_repo / "a.py" + f2 = temp_repo / "b.py" + f1.write_text("same content") + f2.write_text("same content") + assert _hash_file(f1) == _hash_file(f2) + + def test_different_content_different_hash(self, temp_repo: Path) -> None: + f1 = temp_repo / "a.py" + f2 = temp_repo / "b.py" + f1.write_text("content one") + f2.write_text("content two") + assert _hash_file(f1) != _hash_file(f2) + + +class TestHashCacheIO: + def test_save_and_load_cache(self, temp_repo: Path) -> None: + cache_path = temp_repo / cs.HASH_CACHE_FILENAME + data = {"module_a.py": "abc123", "module_b.py": "def456"} + _save_hash_cache(cache_path, data) + + assert cache_path.is_file() + loaded = _load_hash_cache(cache_path) + assert loaded == data + + def test_load_nonexistent_returns_empty(self, temp_repo: Path) -> None: + cache_path = temp_repo / cs.HASH_CACHE_FILENAME + assert _load_hash_cache(cache_path) == {} + + def test_load_corrupted_returns_empty(self, temp_repo: Path) -> None: + cache_path = temp_repo / cs.HASH_CACHE_FILENAME + cache_path.write_text("not valid json {{{") + assert _load_hash_cache(cache_path) == {} + + def test_save_creates_parent_dirs(self, temp_repo: Path) -> None: + cache_path = temp_repo / "subdir" / "nested" / cs.HASH_CACHE_FILENAME + _save_hash_cache(cache_path, {"a.py": "hash1"}) + assert cache_path.is_file() + + def test_cache_file_is_valid_json(self, temp_repo: Path) -> None: + cache_path = temp_repo / cs.HASH_CACHE_FILENAME + data = {"file.py": "sha256hash"} + _save_hash_cache(cache_path, data) + with cache_path.open() as f: + parsed = json.load(f) + assert parsed == data + + +class TestIncrementalUpdates: + def test_unchanged_file_is_skipped( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + mock_ingestor.reset_mock() + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + with patch.object( + updater2, "_process_single_file", wraps=updater2._process_single_file + ) as spy: + updater2.run() + assert spy.call_count == 
0 + + def test_changed_file_is_reparsed( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + (py_project / "module_a.py").write_text("def func_a_updated():\n pass\n") + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + with patch.object( + updater2, "_process_single_file", wraps=updater2._process_single_file + ) as spy: + updater2.run() + processed_paths = [call.args[0] for call in spy.call_args_list] + assert py_project / "module_a.py" in processed_paths + + def test_deleted_file_removed_from_state( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + (py_project / "module_b.py").unlink() + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + with patch.object( + updater2, "remove_file_from_state", wraps=updater2.remove_file_from_state + ) as spy: + updater2.run() + removed_paths = [call.args[0] for call in spy.call_args_list] + assert py_project / "module_b.py" in removed_paths + + def test_force_bypasses_cache( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + with patch.object( + updater2, "_process_single_file", wraps=updater2._process_single_file + ) as spy: + updater2.run(force=True) + assert spy.call_count > 0 + + def test_new_file_is_processed( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + (py_project / "module_c.py").write_text("def func_c():\n pass\n") + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + with patch.object( + updater2, "_process_single_file", wraps=updater2._process_single_file + ) as spy: + updater2.run() + processed_paths = [call.args[0] for call in spy.call_args_list] + assert py_project / "module_c.py" in processed_paths + + def test_hash_cache_file_created_after_run( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + cache_path = py_project / cs.HASH_CACHE_FILENAME + assert not cache_path.exists() + + updater.run() + + assert cache_path.is_file() + with cache_path.open() as f: + data = json.load(f) + assert isinstance(data, dict) + assert len(data) > 0 + + def test_deleted_file_removed_from_hash_cache( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater.run() + + cache_path = py_project / cs.HASH_CACHE_FILENAME + with cache_path.open() as f: + old_data = 
json.load(f) + assert "module_b.py" in old_data + + (py_project / "module_b.py").unlink() + + updater2 = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + updater2.run() + + with cache_path.open() as f: + new_data = json.load(f) + assert "module_b.py" not in new_data + + +class TestSlots: + def test_function_registry_trie_has_slots(self) -> None: + assert hasattr(FunctionRegistryTrie, "__slots__") + trie = FunctionRegistryTrie() + with pytest.raises(AttributeError): + trie.nonexistent_attr = "value" # type: ignore[attr-defined] + + def test_bounded_ast_cache_has_slots(self) -> None: + assert hasattr(BoundedASTCache, "__slots__") + cache = BoundedASTCache() + with pytest.raises(AttributeError): + cache.nonexistent_attr = "value" # type: ignore[attr-defined] diff --git a/codebase_rag/tests/test_graph_updater_pruning.py b/codebase_rag/tests/test_graph_updater_pruning.py new file mode 100644 index 000000000..8657935b6 --- /dev/null +++ b/codebase_rag/tests/test_graph_updater_pruning.py @@ -0,0 +1,263 @@ +# Tests for orphan node pruning in GraphUpdater._prune_orphan_nodes +# and Cypher deletion in _process_files for hash-cache-detected deletions. +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers + + +@pytest.fixture +def updater(temp_repo: Path, mock_ingestor: MagicMock) -> GraphUpdater: + parsers, queries = load_parsers() + return GraphUpdater( + ingestor=mock_ingestor, + repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + + +@pytest.fixture +def py_project(temp_repo: Path) -> Path: + (temp_repo / "__init__.py").touch() + (temp_repo / "module_a.py").write_text("def func_a():\n pass\n") + (temp_repo / "module_b.py").write_text("def func_b():\n pass\n") + sub = temp_repo / "subpkg" + sub.mkdir() + (sub / "__init__.py").touch() + (sub / "inner.py").write_text("def inner_func():\n pass\n") + return temp_repo + + +class TestPruneOrphanNodes: + def test_prune_removes_orphan_module_nodes( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + project_name = py_project.resolve().name + + mock_ingestor.fetch_all.side_effect = [ + [], + [ + { + "path": "old_project/main.py", + "qualified_name": f"{project_name}.old_project.main", + }, + { + "path": "module_a.py", + "qualified_name": f"{project_name}.module_a", + }, + ], + [], + ] + updater._prune_orphan_nodes() + + delete_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_MODULE + ] + assert len(delete_calls) == 1 + assert delete_calls[0].args[1] == {cs.KEY_PATH: "old_project/main.py"} + + def test_prune_skips_other_projects( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + mock_ingestor.fetch_all.side_effect = [ + [{"path": "app.py", "absolute_path": "/other/project/app.py"}], + [{"path": "app.py", "qualified_name": "other_project.app"}], + [{"path": "data", "absolute_path": "/other/project/data"}], + ] + updater._prune_orphan_nodes() + + assert mock_ingestor.execute_write.call_count == 0 + + def
test_prune_no_orphans_skips_deletes( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + project_name = py_project.resolve().name + repo_abs = py_project.resolve().as_posix() + mock_ingestor.fetch_all.side_effect = [ + [{"path": "module_a.py", "absolute_path": f"{repo_abs}/module_a.py"}], + [{"path": "module_a.py", "qualified_name": f"{project_name}.module_a"}], + [{"path": "subpkg", "absolute_path": f"{repo_abs}/subpkg"}], + ] + updater._prune_orphan_nodes() + + assert mock_ingestor.execute_write.call_count == 0 + + def test_prune_handles_empty_graph( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + mock_ingestor.fetch_all.side_effect = [[], [], []] + updater._prune_orphan_nodes() + + assert mock_ingestor.execute_write.call_count == 0 + + def test_prune_handles_none_path_gracefully( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + project_name = py_project.resolve().name + mock_ingestor.fetch_all.side_effect = [ + [{"path": None, "absolute_path": None}], + [ + {"path": None, "qualified_name": f"{project_name}.something"}, + {"path": "module_a.py", "qualified_name": f"{project_name}.module_a"}, + ], + [], + ] + updater._prune_orphan_nodes() + + assert mock_ingestor.execute_write.call_count == 0 + + def test_prune_multiple_orphans_across_types( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + project_name = py_project.resolve().name + repo_abs = py_project.resolve().as_posix() + mock_ingestor.fetch_all.side_effect = [ + [ + {"path": "gone.py", "absolute_path": f"{repo_abs}/gone.py"}, + {"path": "module_a.py", "absolute_path": f"{repo_abs}/module_a.py"}, + ], + [ + { + "path": "deleted.py", + "qualified_name": f"{project_name}.deleted", + }, + { + "path": "module_a.py", + "qualified_name": f"{project_name}.module_a", + }, + ], + [ + {"path": "old_dir", "absolute_path": f"{repo_abs}/old_dir"}, + {"path": "subpkg", "absolute_path": f"{repo_abs}/subpkg"}, + ], + ] + updater._prune_orphan_nodes() + + assert mock_ingestor.execute_write.call_count == 3 + + +class TestDeletedFileInProcessFiles: + def test_deleted_file_triggers_cypher_delete( + self, py_project: Path, mock_ingestor: MagicMock + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + updater.run(force=True) + mock_ingestor.execute_write.reset_mock() + + (py_project / "module_b.py").unlink() + updater.run(force=False) + + delete_module_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_MODULE + ] + delete_file_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_FILE + ] + assert len(delete_module_calls) >= 1 + assert len(delete_file_calls) >= 1 + + def test_no_deletes_when_no_files_removed( + self, py_project: Path, mock_ingestor: MagicMock + ) -> 
None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + updater.run(force=True) + mock_ingestor.execute_write.reset_mock() + + updater.run(force=False) + + delete_calls = [ + c + for c in mock_ingestor.execute_write.call_args_list + if c.args[0] in (cs.CYPHER_DELETE_MODULE, cs.CYPHER_DELETE_FILE) + ] + assert len(delete_calls) == 0 + + @patch("codebase_rag.graph_updater.GraphUpdater._prune_orphan_nodes") + def test_run_calls_prune( + self, + mock_prune: MagicMock, + py_project: Path, + mock_ingestor: MagicMock, + ) -> None: + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=py_project, + parsers=parsers, + queries=queries, + ) + + updater.run(force=True) + mock_prune.assert_called_once() diff --git a/codebase_rag/tests/test_handler_registry.py b/codebase_rag/tests/test_handler_registry.py index 2a9215755..6b7259f18 100644 --- a/codebase_rag/tests/test_handler_registry.py +++ b/codebase_rag/tests/test_handler_registry.py @@ -9,6 +9,7 @@ from codebase_rag.parsers.handlers.java import JavaHandler from codebase_rag.parsers.handlers.js_ts import JsTsHandler from codebase_rag.parsers.handlers.lua import LuaHandler +from codebase_rag.parsers.handlers.php import PhpHandler from codebase_rag.parsers.handlers.python import PythonHandler from codebase_rag.parsers.handlers.rust import RustHandler @@ -47,8 +48,12 @@ def test_returns_base_handler_for_go(self) -> None: assert isinstance(handler, BaseLanguageHandler) assert type(handler) is BaseLanguageHandler - def test_returns_base_handler_for_php(self) -> None: + def test_returns_php_handler_for_php(self) -> None: handler = get_handler(SupportedLanguage.PHP) + assert isinstance(handler, PhpHandler) + + def test_returns_base_handler_for_c(self) -> None: + handler = get_handler(SupportedLanguage.C) assert isinstance(handler, BaseLanguageHandler) assert type(handler) is BaseLanguageHandler @@ -84,6 +89,7 @@ class TestHandlerProtocol: SupportedLanguage.PYTHON, SupportedLanguage.GO, SupportedLanguage.PHP, + SupportedLanguage.C, ], ) def test_handler_has_all_protocol_methods( @@ -114,6 +120,8 @@ def test_handler_has_all_protocol_methods( SupportedLanguage.JAVA, SupportedLanguage.LUA, SupportedLanguage.PYTHON, + SupportedLanguage.PHP, + SupportedLanguage.C, ], ) def test_handler_methods_are_callable(self, language: SupportedLanguage) -> None: @@ -151,3 +159,6 @@ def test_lua_handler_extends_base(self) -> None: def test_python_handler_extends_base(self) -> None: assert issubclass(PythonHandler, BaseLanguageHandler) + + def test_php_handler_extends_base(self) -> None: + assert issubclass(PhpHandler, BaseLanguageHandler) diff --git a/codebase_rag/tests/test_handlers_unit.py b/codebase_rag/tests/test_handlers_unit.py index a9391ecde..f34d42d86 100644 --- a/codebase_rag/tests/test_handlers_unit.py +++ b/codebase_rag/tests/test_handlers_unit.py @@ -13,6 +13,7 @@ from codebase_rag.parsers.handlers.java import JavaHandler from codebase_rag.parsers.handlers.js_ts import JsTsHandler from codebase_rag.parsers.handlers.lua import LuaHandler +from codebase_rag.parsers.handlers.php import PhpHandler from codebase_rag.parsers.handlers.python import PythonHandler from codebase_rag.parsers.handlers.rust import RustHandler from codebase_rag.tests.conftest import create_mock_node @@ -62,6 +63,13 @@ except ImportError: LUA_AVAILABLE = False +try: + import tree_sitter_php as tsphp + + PHP_AVAILABLE = True +except ImportError: 
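+ # Editor's note (descriptive comment, assumption-free): this is the standard optional-dependency guard — + # availability of the tree-sitter-php extra is probed once at import time, and + # the PhpHandler tests below are skipped via pytest.mark.skipif(not PHP_AVAILABLE, ...).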
+ PHP_AVAILABLE = False + @pytest.fixture def js_parser() -> Parser | None: @@ -111,6 +119,14 @@ def lua_parser() -> Parser | None: return Parser(language) +@pytest.fixture +def php_parser() -> Parser | None: + if not PHP_AVAILABLE: + return None + language = Language(tsphp.language_php()) + return Parser(language) + + class TestBaseLanguageHandler: def test_is_inside_method_with_object_literals_returns_false(self) -> None: handler = BaseLanguageHandler() @@ -1105,3 +1121,168 @@ def test_extract_decorators_dataclass_with_options( result = handler.extract_decorators(class_node) assert result == ["@dataclass(frozen=True, slots=True)"] + + +def _find_php_node(root: ASTNode, node_type: str) -> ASTNode | None: + if root.type == node_type: + return root + for child in root.children: + if result := _find_php_node(child, node_type): + return result + return None + + +@pytest.mark.skipif(not PHP_AVAILABLE, reason="tree-sitter-php not available") +class TestPhpHandler: + def test_extract_function_name_from_function_definition( + self, php_parser: Parser + ) -> None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" 2;" + tree = php_parser.parse(code) + arrow_node = _find_php_node(tree.root_node, cs.TS_PHP_ARROW_FUNCTION) + assert arrow_node is not None + + result = handler.extract_function_name(arrow_node) + assert result is not None + assert result.startswith("arrow_") + + def test_is_class_method_inside_class(self, php_parser: Parser) -> None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b' None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b" None: + handler = PhpHandler() + code = b' None: + (tmp_path / "utils").mkdir() + (tmp_path / "utils" / "__init__.py").touch() + + processor = ImportProcessor( + repo_path=tmp_path, + project_name="myproject", + ingestor=None, + function_registry=None, + ) + + assert processor._is_local_module("utils") is True + assert processor._is_local_module("nonexistent") is False + + def test_is_local_module_cache_hits_on_repeated_calls(self, tmp_path: Path) -> None: + (tmp_path / "models").mkdir() + (tmp_path / "models" / "__init__.py").touch() + + processor = ImportProcessor( + repo_path=tmp_path, + project_name="myproject", + ingestor=None, + function_registry=None, + ) + + processor._is_local_module("models") + processor._is_local_module("models") + processor._is_local_module("models") + + info = processor._is_local_module_cached.cache_info() + assert info.hits >= 2 + assert info.misses == 1 + + def test_is_local_module_detects_py_file(self, tmp_path: Path) -> None: + (tmp_path / "helpers.py").touch() + + processor = ImportProcessor( + repo_path=tmp_path, + project_name="myproject", + ingestor=None, + function_registry=None, + ) + + assert processor._is_local_module("helpers") is True + + def test_is_local_module_detects_directory(self, tmp_path: Path) -> None: + (tmp_path / "services").mkdir() + + processor = ImportProcessor( + repo_path=tmp_path, + project_name="myproject", + ingestor=None, + function_registry=None, + ) + + assert processor._is_local_module("services") is True + + def test_is_local_java_import_cache_hits(self, 
tmp_path: Path) -> None: + (tmp_path / "com").mkdir() + + processor = ImportProcessor( + repo_path=tmp_path, + project_name="myproject", + ingestor=None, + function_registry=None, + ) + + processor._is_local_java_import("com.example.Service") + processor._is_local_java_import("com.example.Service") + processor._is_local_java_import("com.example.Service") + + info = processor._is_local_java_import_cached.cache_info() + assert info.hits >= 2 + assert info.misses == 1 + + def test_separate_instances_have_independent_caches(self, tmp_path: Path) -> None: + (tmp_path / "shared").mkdir() + + p1 = ImportProcessor( + repo_path=tmp_path, + project_name="project1", + ingestor=None, + function_registry=None, + ) + p2 = ImportProcessor( + repo_path=tmp_path, + project_name="project2", + ingestor=None, + function_registry=None, + ) + + p1._is_local_module("shared") + p1._is_local_module("shared") + + info2 = p2._is_local_module_cached.cache_info() + assert info2.hits == 0 + assert info2.misses == 0 diff --git a/codebase_rag/tests/test_java_label_name_collision.py b/codebase_rag/tests/test_java_label_name_collision.py new file mode 100644 index 000000000..c43702119 --- /dev/null +++ b/codebase_rag/tests/test_java_label_name_collision.py @@ -0,0 +1,314 @@ +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag.constants import NODE_UNIQUE_CONSTRAINTS, NodeLabel +from codebase_rag.tests.conftest import ( + get_node_names, + get_nodes, + get_qualified_names, + get_relationships, + run_updater, +) +from codebase_rag.types_defs import NodeType + + +@pytest.fixture +def java_label_collision_project(temp_repo: Path) -> Path: + project_path = temp_repo / "java_label_collision" + project_path.mkdir() + src = project_path / "src" / "main" / "java" / "com" / "example" + src.mkdir(parents=True) + return project_path + + +def _src_dir(project: Path) -> Path: + return project / "src" / "main" / "java" / "com" / "example" + + +def _has_qn_ending(qns: set[str], suffix: str) -> bool: + return any(qn.endswith(suffix) for qn in qns) + + +def test_interface_named_interface_ingested_as_interface_node( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Interface.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public interface Interface { + void doSomething(); +} +""", + ) + run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + interface_nodes = get_nodes(mock_ingestor, NodeType.INTERFACE) + interface_qns = get_qualified_names(interface_nodes) + + assert _has_qn_ending(interface_qns, ".Interface"), ( + f"Interface named 'Interface' not found in Interface nodes. Got: {interface_qns}" + ) + + class_qns = get_node_names(mock_ingestor, NodeType.CLASS) + interface_in_class = [qn for qn in class_qns if qn.endswith(".Interface")] + assert not interface_in_class, ( + f"Interface named 'Interface' should not appear as a Class node. 
Got: {interface_in_class}" + ) + + +def test_enum_named_enum_ingested_as_enum_node( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Enum.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public enum Enum { + VALUE_A, + VALUE_B, + VALUE_C +} +""", + ) + run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + enum_nodes = get_nodes(mock_ingestor, NodeType.ENUM) + enum_qns = get_qualified_names(enum_nodes) + + assert _has_qn_ending(enum_qns, ".Enum"), ( + f"Enum named 'Enum' not found in Enum nodes. Got: {enum_qns}" + ) + + class_qns = get_node_names(mock_ingestor, NodeType.CLASS) + enum_in_class = [qn for qn in class_qns if qn.endswith(".Enum")] + assert not enum_in_class, ( + f"Enum named 'Enum' should not appear as a Class node. Got: {enum_in_class}" + ) + + +def test_class_named_class_ingested_as_class_node( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Class.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public class Class { + public void run() {} +} +""", + ) + run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + class_nodes = get_nodes(mock_ingestor, NodeType.CLASS) + class_qns = get_qualified_names(class_nodes) + + assert _has_qn_ending(class_qns, ".Class"), ( + f"Class named 'Class' not found in Class nodes. Got: {class_qns}" + ) + + +def test_interface_and_enum_labels_have_constraints() -> None: + assert NodeLabel.INTERFACE in NODE_UNIQUE_CONSTRAINTS, ( + "Interface label missing from NODE_UNIQUE_CONSTRAINTS" + ) + assert NodeLabel.ENUM in NODE_UNIQUE_CONSTRAINTS, ( + "Enum label missing from NODE_UNIQUE_CONSTRAINTS" + ) + assert NODE_UNIQUE_CONSTRAINTS[NodeLabel.INTERFACE] == "qualified_name" + assert NODE_UNIQUE_CONSTRAINTS[NodeLabel.ENUM] == "qualified_name" + + +def test_all_node_labels_have_constraints() -> None: + for label in NodeLabel: + assert label.value in NODE_UNIQUE_CONSTRAINTS, ( + f"NodeLabel.{label.name} ('{label.value}') missing from NODE_UNIQUE_CONSTRAINTS" + ) + + +def test_interface_named_interface_has_defines_relationship( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Interface.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public interface Interface { + void doSomething(); +} +""", + ) + run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + defines_rels = get_relationships(mock_ingestor, "DEFINES") + found_defines = False + for rel in defines_rels: + if len(rel.args) >= 3: + to_spec = rel.args[2] + if isinstance(to_spec, tuple) and len(to_spec) >= 3: + to_label = to_spec[0] + to_qn = str(to_spec[2]) + if to_qn.endswith(".Interface"): + assert to_label == NodeType.INTERFACE, ( + f"DEFINES target label should be 'Interface', got '{to_label}'" + ) + found_defines = True + + assert found_defines, ( + "No DEFINES relationship found for Interface named 'Interface'" + ) + + +def test_enum_named_enum_has_defines_relationship( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Enum.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public enum Enum { + VALUE_A, + VALUE_B +} +""", + ) + run_updater(java_label_collision_project, 
mock_ingestor, skip_if_missing="java") + + defines_rels = get_relationships(mock_ingestor, "DEFINES") + found_defines = False + for rel in defines_rels: + if len(rel.args) >= 3: + to_spec = rel.args[2] + if isinstance(to_spec, tuple) and len(to_spec) >= 3: + to_label = to_spec[0] + to_qn = str(to_spec[2]) + if to_qn.endswith(".Enum"): + assert to_label == NodeType.ENUM, ( + f"DEFINES target label should be 'Enum', got '{to_label}'" + ) + found_defines = True + + assert found_defines, "No DEFINES relationship found for Enum named 'Enum'" + + +def test_class_implementing_interface_named_interface( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Interface.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public interface Interface { + void doSomething(); +} +""", + ) + (src / "Implementor.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public class Implementor implements Interface { + public void doSomething() { + System.out.println("done"); + } +} +""", + ) + run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + interface_qns = get_node_names(mock_ingestor, NodeType.INTERFACE) + assert _has_qn_ending(interface_qns, ".Interface") + + class_qns = get_node_names(mock_ingestor, NodeType.CLASS) + assert _has_qn_ending(class_qns, ".Implementor") + + implements_rels = get_relationships(mock_ingestor, "IMPLEMENTS") + found_implements = False + for rel in implements_rels: + if len(rel.args) >= 3: + from_spec = rel.args[0] + if isinstance(from_spec, tuple) and len(from_spec) >= 3: + from_qn = str(from_spec[2]) + if from_qn.endswith(".Implementor"): + found_implements = True + + assert found_implements, ( + "No IMPLEMENTS relationship found for Implementor -> Interface" + ) + + +def test_multiple_label_colliding_names( + java_label_collision_project: Path, + mock_ingestor: MagicMock, +) -> None: + src = _src_dir(java_label_collision_project) + (src / "Function.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public class Function { + public void execute() {} +} +""", + ) + (src / "Method.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public class Method { + public void invoke() {} +} +""", + ) + (src / "Module.java").write_text( + encoding="utf-8", + data="""\ +package com.example; + +public class Module { + public void load() {} +} +""", + ) + run_updater(java_label_collision_project, mock_ingestor, skip_if_missing="java") + + class_qns = get_node_names(mock_ingestor, NodeType.CLASS) + assert _has_qn_ending(class_qns, ".Function") + assert _has_qn_ending(class_qns, ".Method") + assert _has_qn_ending(class_qns, ".Module") + + function_qns = get_node_names(mock_ingestor, NodeType.FUNCTION) + method_qns = get_node_names(mock_ingestor, NodeType.METHOD) + non_class_qns = function_qns | method_qns + collisions = [ + qn + for qn in non_class_qns + if qn.endswith(".Function") or qn.endswith(".Method") or qn.endswith(".Module") + ] + assert not collisions, ( + f"Class names colliding with node labels should not appear as wrong node types: {collisions}" + ) diff --git a/codebase_rag/tests/test_language_node_coverage.py b/codebase_rag/tests/test_language_node_coverage.py index 74648125f..4d902abda 100644 --- a/codebase_rag/tests/test_language_node_coverage.py +++ b/codebase_rag/tests/test_language_node_coverage.py @@ -3,6 +3,7 @@ import pytest from codebase_rag.constants import ( + 
C_EXTENSIONS, CPP_EXTENSIONS, CS_EXTENSIONS, GO_EXTENSIONS, @@ -60,6 +61,7 @@ def test_each_language_has_file_extensions(self, lang: SupportedLanguage) -> Non (SupportedLanguage.GO, GO_EXTENSIONS), (SupportedLanguage.SCALA, SCALA_EXTENSIONS), (SupportedLanguage.JAVA, JAVA_EXTENSIONS), + (SupportedLanguage.C, C_EXTENSIONS), (SupportedLanguage.CPP, CPP_EXTENSIONS), (SupportedLanguage.CSHARP, CS_EXTENSIONS), (SupportedLanguage.PHP, PHP_EXTENSIONS), @@ -87,6 +89,7 @@ def test_language_spec_has_correct_extensions( (".go", SupportedLanguage.GO), (".scala", SupportedLanguage.SCALA), (".java", SupportedLanguage.JAVA), + (".c", SupportedLanguage.C), (".cpp", SupportedLanguage.CPP), (".h", SupportedLanguage.CPP), (".hpp", SupportedLanguage.CPP), diff --git a/codebase_rag/tests/test_mcp_tools_helpers.py b/codebase_rag/tests/test_mcp_tools_helpers.py new file mode 100644 index 000000000..7804c9fa0 --- /dev/null +++ b/codebase_rag/tests/test_mcp_tools_helpers.py @@ -0,0 +1,98 @@ +from unittest.mock import MagicMock, patch + +from codebase_rag import constants as cs + +_PATCH_DELETE = "codebase_rag.mcp.tools.delete_project_embeddings" + + +def _make_registry(mock_ingestor: MagicMock) -> MagicMock: + from codebase_rag.mcp.tools import MCPToolsRegistry + + registry = MagicMock(spec=MCPToolsRegistry) + registry.ingestor = mock_ingestor + registry._get_project_node_ids = MCPToolsRegistry._get_project_node_ids.__get__( + registry + ) + registry._cleanup_project_embeddings = ( + MCPToolsRegistry._cleanup_project_embeddings.__get__(registry) + ) + return registry + + +class TestGetProjectNodeIds: + def test_returns_integer_ids(self) -> None: + mock_ingestor = MagicMock() + mock_ingestor.fetch_all.return_value = [ + {cs.KEY_NODE_ID: 1}, + {cs.KEY_NODE_ID: 2}, + {cs.KEY_NODE_ID: 3}, + ] + registry = _make_registry(mock_ingestor) + + result = registry._get_project_node_ids("myproject") + + assert result == [1, 2, 3] + mock_ingestor.fetch_all.assert_called_once_with( + cs.CYPHER_QUERY_PROJECT_NODE_IDS, + {cs.KEY_PROJECT_NAME: "myproject"}, + ) + + def test_filters_non_integer_ids(self) -> None: + mock_ingestor = MagicMock() + mock_ingestor.fetch_all.return_value = [ + {cs.KEY_NODE_ID: 1}, + {cs.KEY_NODE_ID: "not_an_int"}, + {cs.KEY_NODE_ID: None}, + {cs.KEY_NODE_ID: 4}, + ] + registry = _make_registry(mock_ingestor) + + result = registry._get_project_node_ids("proj") + + assert result == [1, 4] + + def test_returns_empty_when_no_rows(self) -> None: + mock_ingestor = MagicMock() + mock_ingestor.fetch_all.return_value = [] + registry = _make_registry(mock_ingestor) + + result = registry._get_project_node_ids("empty") + + assert result == [] + + def test_skips_rows_missing_key(self) -> None: + mock_ingestor = MagicMock() + mock_ingestor.fetch_all.return_value = [ + {"other_key": 99}, + {cs.KEY_NODE_ID: 5}, + ] + registry = _make_registry(mock_ingestor) + + result = registry._get_project_node_ids("proj") + + assert result == [5] + + +class TestCleanupProjectEmbeddings: + def test_calls_delete_with_node_ids(self) -> None: + mock_ingestor = MagicMock() + mock_ingestor.fetch_all.return_value = [ + {cs.KEY_NODE_ID: 10}, + {cs.KEY_NODE_ID: 20}, + ] + registry = _make_registry(mock_ingestor) + + with patch(_PATCH_DELETE) as mock_delete: + registry._cleanup_project_embeddings("myproject") + + mock_delete.assert_called_once_with("myproject", [10, 20]) + + def test_calls_delete_with_empty_list_when_no_nodes(self) -> None: + mock_ingestor = MagicMock() + mock_ingestor.fetch_all.return_value = [] + registry = 
_make_registry(mock_ingestor) + + with patch(_PATCH_DELETE) as mock_delete: + registry._cleanup_project_embeddings("empty_proj") + + mock_delete.assert_called_once_with("empty_proj", []) diff --git a/codebase_rag/tests/test_mcp_update_and_search.py b/codebase_rag/tests/test_mcp_update_and_search.py new file mode 100644 index 000000000..a55090ccb --- /dev/null +++ b/codebase_rag/tests/test_mcp_update_and_search.py @@ -0,0 +1,496 @@ +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.mcp.client import query_mcp_server +from codebase_rag.mcp.tools import MCPToolsRegistry + +pytestmark = [pytest.mark.anyio] + + +@pytest.fixture(params=["asyncio"]) +def anyio_backend(request: pytest.FixtureRequest) -> str: + return str(request.param) + + +@pytest.fixture +def temp_project_root(tmp_path: Path) -> Path: + sample_file = tmp_path / "app.py" + sample_file.write_text("def main(): pass\n", encoding="utf-8") + return tmp_path + + +@pytest.fixture +def mcp_registry(temp_project_root: Path) -> MCPToolsRegistry: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + return registry + + +class TestUpdateRepository: + async def test_update_repository_success( + self, mcp_registry: MCPToolsRegistry + ) -> None: + with patch("codebase_rag.mcp.tools.GraphUpdater") as mock_updater_cls: + mock_updater = MagicMock() + mock_updater_cls.return_value = mock_updater + + result = await mcp_registry.update_repository() + + mock_updater_cls.assert_called_once() + mock_updater.run.assert_called_once() + assert mcp_registry.project_root in result + + async def test_update_repository_error( + self, mcp_registry: MCPToolsRegistry + ) -> None: + with patch("codebase_rag.mcp.tools.GraphUpdater") as mock_updater_cls: + mock_updater_cls.side_effect = RuntimeError("parse error") + + result = await mcp_registry.update_repository() + + assert "Error" in result + + async def test_update_repository_registered( + self, mcp_registry: MCPToolsRegistry + ) -> None: + assert cs.MCPToolName.UPDATE_REPOSITORY in mcp_registry._tools + + async def test_update_repository_no_wipe( + self, mcp_registry: MCPToolsRegistry + ) -> None: + with patch("codebase_rag.mcp.tools.GraphUpdater") as mock_updater_cls: + mock_updater = MagicMock() + mock_updater_cls.return_value = mock_updater + + await mcp_registry.update_repository() + + mcp_registry.ingestor.delete_project.assert_not_called() + mcp_registry.ingestor.clean_database.assert_not_called() + + +class TestSemanticSearchRegistration: + def test_semantic_search_not_registered_without_deps( + self, temp_project_root: Path + ) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=False, + ): + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + assert cs.MCPToolName.SEMANTIC_SEARCH not in registry._tools + assert registry._semantic_search_available is False + + def test_semantic_search_registered_with_deps( + self, temp_project_root: Path + ) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with ( + patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=True, + ), + patch( + 
"codebase_rag.tools.semantic_search.create_semantic_search_tool" + ) as mock_create, + ): + mock_tool = MagicMock() + mock_create.return_value = mock_tool + + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + assert cs.MCPToolName.SEMANTIC_SEARCH in registry._tools + assert registry._semantic_search_available is True + + async def test_semantic_search_calls_tool(self, temp_project_root: Path) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with ( + patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=True, + ), + patch( + "codebase_rag.tools.semantic_search.create_semantic_search_tool" + ) as mock_create, + ): + mock_tool = MagicMock() + mock_tool.function = AsyncMock(return_value="result1, result2") + mock_create.return_value = mock_tool + + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + result = await registry.semantic_search("find auth functions", top_k=3) + + mock_tool.function.assert_called_once_with( + query="find auth functions", top_k=3 + ) + assert "result1" in result + + +class TestAskAgent: + async def test_ask_agent_registered(self, mcp_registry: MCPToolsRegistry) -> None: + assert cs.MCPToolName.ASK_AGENT in mcp_registry._tools + + async def test_ask_agent_success(self, mcp_registry: MCPToolsRegistry) -> None: + mock_agent = MagicMock() + mock_response = MagicMock() + mock_response.output = "The auth module uses JWT tokens." + mock_agent.run = AsyncMock(return_value=mock_response) + mcp_registry.rag_agent = mock_agent + + result = await mcp_registry.ask_agent("How is auth implemented?") + + assert result["output"] == "The auth module uses JWT tokens." 
+ mock_agent.run.assert_called_once_with( + "How is auth implemented?", message_history=[] + ) + + async def test_ask_agent_error(self, mcp_registry: MCPToolsRegistry) -> None: + mock_agent = MagicMock() + mock_agent.run = AsyncMock(side_effect=RuntimeError("LLM unavailable")) + mcp_registry.rag_agent = mock_agent + + result = await mcp_registry.ask_agent("What does main do?") + + assert "error" in result + + +class TestToolDescriptions: + def test_update_repository_in_tool_map(self) -> None: + from codebase_rag.tools.tool_descriptions import MCP_TOOLS + + assert cs.MCPToolName.UPDATE_REPOSITORY in MCP_TOOLS + + def test_semantic_search_in_tool_map(self) -> None: + from codebase_rag.tools.tool_descriptions import MCP_TOOLS + + assert cs.MCPToolName.SEMANTIC_SEARCH in MCP_TOOLS + + def test_ask_agent_in_tool_map(self) -> None: + from codebase_rag.tools.tool_descriptions import MCP_TOOLS + + assert cs.MCPToolName.ASK_AGENT in MCP_TOOLS + + def test_index_repository_warns_about_project_clear(self) -> None: + from codebase_rag.tools.tool_descriptions import MCP_INDEX_REPOSITORY + + assert "current project" in MCP_INDEX_REPOSITORY + assert "entire database" not in MCP_INDEX_REPOSITORY + + +class TestRagAgentProperty: + def test_rag_agent_setter_allows_mock(self, mcp_registry: MCPToolsRegistry) -> None: + mock_agent = MagicMock() + mcp_registry.rag_agent = mock_agent + assert mcp_registry.rag_agent is mock_agent + + def test_rag_agent_lazy_init(self, temp_project_root: Path) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=False, + ): + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + assert registry._rag_agent is None + + with patch("codebase_rag.mcp.tools.create_rag_orchestrator") as mock_create: + mock_agent = MagicMock() + mock_create.return_value = mock_agent + + agent = registry.rag_agent + + mock_create.assert_called_once() + assert agent is mock_agent + + def test_rag_agent_includes_function_source_tool( + self, temp_project_root: Path + ) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=False, + ): + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + with ( + patch("codebase_rag.mcp.tools.create_rag_orchestrator") as mock_create, + patch( + "codebase_rag.tools.semantic_search.create_get_function_source_tool" + ) as mock_fst, + ): + mock_tool = MagicMock() + mock_fst.return_value = mock_tool + mock_create.return_value = MagicMock() + + registry.rag_agent + + tools_arg = mock_create.call_args[1]["tools"] + assert mock_tool in tools_arg + + def test_rag_agent_includes_semantic_search_when_available( + self, temp_project_root: Path + ) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with ( + patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=True, + ), + patch( + "codebase_rag.tools.semantic_search.create_semantic_search_tool" + ) as mock_ss, + ): + mock_ss_tool = MagicMock() + mock_ss.return_value = mock_ss_tool + + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + with ( + patch("codebase_rag.mcp.tools.create_rag_orchestrator") as mock_create, + 
patch("codebase_rag.tools.semantic_search.create_get_function_source_tool"), + ): + mock_create.return_value = MagicMock() + registry.rag_agent + + tools_arg = mock_create.call_args[1]["tools"] + assert mock_ss_tool in tools_arg + + def test_rag_agent_caches_after_first_access(self, temp_project_root: Path) -> None: + mock_ingestor = MagicMock() + mock_cypher_gen = MagicMock() + + with patch( + "codebase_rag.mcp.tools.has_semantic_dependencies", + return_value=False, + ): + registry = MCPToolsRegistry( + project_root=str(temp_project_root), + ingestor=mock_ingestor, + cypher_gen=mock_cypher_gen, + ) + + with ( + patch("codebase_rag.mcp.tools.create_rag_orchestrator") as mock_create, + patch("codebase_rag.tools.semantic_search.create_get_function_source_tool"), + ): + mock_create.return_value = MagicMock() + + agent1 = registry.rag_agent + agent2 = registry.rag_agent + + mock_create.assert_called_once() + assert agent1 is agent2 + + +class TestMainSingleQuery: + def test_main_single_query_prints_output( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + from codebase_rag.main import main_single_query + + mock_response = MagicMock() + mock_response.output = "The answer is 42." + + with ( + patch("codebase_rag.main.connect_memgraph") as mock_conn, + patch("codebase_rag.main._initialize_services_and_agent") as mock_init, + patch("codebase_rag.main.asyncio") as mock_asyncio, + patch("codebase_rag.main._setup_common_initialization"), + ): + mock_agent = MagicMock() + mock_init.return_value = (mock_agent, []) + mock_asyncio.run.return_value = mock_response + mock_conn.return_value.__enter__ = MagicMock(return_value=MagicMock()) + mock_conn.return_value.__exit__ = MagicMock(return_value=False) + + main_single_query(str(tmp_path), 1000, "What is the answer?") + + captured = capsys.readouterr() + assert "The answer is 42." 
in captured.out + + def test_main_single_query_routes_logs_to_stderr(self, tmp_path: Path) -> None: + from codebase_rag.main import main_single_query + + mock_response = MagicMock() + mock_response.output = "result" + + with ( + patch("codebase_rag.main.connect_memgraph") as mock_conn, + patch("codebase_rag.main._initialize_services_and_agent") as mock_init, + patch("codebase_rag.main.asyncio") as mock_asyncio, + patch("codebase_rag.main._setup_common_initialization"), + patch("codebase_rag.main.logger") as mock_logger, + ): + mock_agent = MagicMock() + mock_init.return_value = (mock_agent, []) + mock_asyncio.run.return_value = mock_response + mock_conn.return_value.__enter__ = MagicMock(return_value=MagicMock()) + mock_conn.return_value.__exit__ = MagicMock(return_value=False) + + main_single_query(str(tmp_path), 1000, "test") + + mock_logger.remove.assert_called_once() + mock_logger.add.assert_called_once() + add_args = mock_logger.add.call_args + import sys + + assert add_args[0][0] is sys.stderr + + +class TestMCPClient: + def test_query_mcp_server_is_callable(self) -> None: + assert callable(query_mcp_server) + + def test_client_uses_constants(self) -> None: + import inspect + + from codebase_rag.mcp import client + + source = inspect.getsource(client) + assert "MCPToolName.ASK_AGENT" in source + assert "MCPParamName.QUESTION" in source + + def test_query_with_errlog_is_async(self) -> None: + import asyncio + + from codebase_rag.mcp.client import _query_with_errlog + + assert asyncio.iscoroutinefunction(_query_with_errlog) + + async def test_query_with_errlog_json_response(self) -> None: + import io + + from codebase_rag.mcp.client import _query_with_errlog + + mock_content = MagicMock() + mock_content.text = '{"output": "test answer"}' + mock_result = MagicMock() + mock_result.content = [mock_content] + + mock_session = AsyncMock() + mock_session.initialize = AsyncMock() + mock_session.call_tool = AsyncMock(return_value=mock_result) + mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = AsyncMock(return_value=False) + + mock_transport = AsyncMock() + mock_transport.__aenter__ = AsyncMock(return_value=(MagicMock(), MagicMock())) + mock_transport.__aexit__ = AsyncMock(return_value=False) + + with ( + patch("codebase_rag.mcp.client.stdio_client", return_value=mock_transport), + patch("codebase_rag.mcp.client.ClientSession", return_value=mock_session), + ): + result = await _query_with_errlog("test question", io.StringIO()) + + assert result == {"output": "test answer"} + + async def test_query_with_errlog_non_json_response(self) -> None: + import io + + from codebase_rag.mcp.client import _query_with_errlog + + mock_content = MagicMock() + mock_content.text = "plain text response" + mock_result = MagicMock() + mock_result.content = [mock_content] + + mock_session = AsyncMock() + mock_session.initialize = AsyncMock() + mock_session.call_tool = AsyncMock(return_value=mock_result) + mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = AsyncMock(return_value=False) + + mock_transport = AsyncMock() + mock_transport.__aenter__ = AsyncMock(return_value=(MagicMock(), MagicMock())) + mock_transport.__aexit__ = AsyncMock(return_value=False) + + with ( + patch("codebase_rag.mcp.client.stdio_client", return_value=mock_transport), + patch("codebase_rag.mcp.client.ClientSession", return_value=mock_session), + ): + result = await _query_with_errlog("test", io.StringIO()) + + assert result == {"output": "plain text response"} + + 
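+ # Descriptive note: an empty content list from the MCP server should map to + # the stable "No response from server" payload rather than raising; the test + # below pins that fallback.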
async def test_query_with_errlog_empty_response(self) -> None: + import io + + from codebase_rag.mcp.client import _query_with_errlog + + mock_result = MagicMock() + mock_result.content = [] + + mock_session = AsyncMock() + mock_session.initialize = AsyncMock() + mock_session.call_tool = AsyncMock(return_value=mock_result) + mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = AsyncMock(return_value=False) + + mock_transport = AsyncMock() + mock_transport.__aenter__ = AsyncMock(return_value=(MagicMock(), MagicMock())) + mock_transport.__aexit__ = AsyncMock(return_value=False) + + with ( + patch("codebase_rag.mcp.client.stdio_client", return_value=mock_transport), + patch("codebase_rag.mcp.client.ClientSession", return_value=mock_session), + ): + result = await _query_with_errlog("test", io.StringIO()) + + assert result == {"output": "No response from server"} + + def test_query_mcp_server_opens_devnull(self) -> None: + with ( + patch("codebase_rag.mcp.client.asyncio") as mock_asyncio, + patch("builtins.open", MagicMock()) as mock_open, + ): + mock_asyncio.run.return_value = {"output": "result"} + query_mcp_server("test") + mock_open.assert_called_once() diff --git a/codebase_rag/tests/test_mcp_write_file.py b/codebase_rag/tests/test_mcp_write_file.py index 6c214c12a..dd222e9c6 100644 --- a/codebase_rag/tests/test_mcp_write_file.py +++ b/codebase_rag/tests/test_mcp_write_file.py @@ -199,6 +199,10 @@ class TestWriteFileErrorHandling: @pytest.mark.skipif( os.name == "nt", reason="chmod 0o444 does not prevent file creation on Windows" ) + @pytest.mark.skipif( + hasattr(os, "getuid") and os.getuid() == 0, + reason="root bypasses filesystem permissions", + ) async def test_write_to_readonly_directory( self, mcp_registry: MCPToolsRegistry, temp_project_root: Path ) -> None: diff --git a/codebase_rag/tests/test_memgraph_batching.py b/codebase_rag/tests/test_memgraph_batching.py index a3297e819..81c068b66 100644 --- a/codebase_rag/tests/test_memgraph_batching.py +++ b/codebase_rag/tests/test_memgraph_batching.py @@ -64,15 +64,20 @@ def test_node_batch_preserves_per_row_properties() -> None: def test_relationship_batch_flushes_after_threshold_and_respects_node_flush() -> None: ingestor, cursor_mock = _create_ingestor_with_mocked_connection() + col = MagicMock() + col.name = "created" + cursor_mock.description = [col] + cursor_mock.fetchall.return_value = [(1,), (1,)] + with patch.object( - ingestor, "flush_nodes", wraps=ingestor.flush_nodes + MemgraphIngestor, "flush_nodes", wraps=ingestor.flush_nodes ) as flush_nodes_spy: ingestor.ensure_relationship_batch( ("Module", "qualified_name", "proj.module1"), "CONTAINS_FILE", ("File", "path", "file1"), ) - assert len(ingestor.relationship_buffer) == 1 + assert ingestor._rel_count == 1 cursor_mock.execute.assert_not_called() ingestor.ensure_relationship_batch( @@ -83,7 +88,7 @@ def test_relationship_batch_flushes_after_threshold_and_respects_node_flush() -> assert flush_nodes_spy.call_count == 1 - assert len(ingestor.relationship_buffer) == 0 + assert ingestor._rel_count == 0 cursor_mock.execute.assert_called_once() executed_query = cursor_mock.execute.call_args[0][0] assert "UNWIND $batch" in executed_query diff --git a/codebase_rag/tests/test_method_calls_caller_attribution.py b/codebase_rag/tests/test_method_calls_caller_attribution.py new file mode 100644 index 000000000..6c4cd2a01 --- /dev/null +++ b/codebase_rag/tests/test_method_calls_caller_attribution.py @@ -0,0 +1,679 @@ +from __future__ import annotations + 
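+# Descriptive note: these tests pin caller attribution on CALLS edges — a call +# made inside a method body must be recorded with a Method node as the caller, +# not attributed to the enclosing Module or to a free Function. The helpers +# below filter the captured relationships by that caller label.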
+from pathlib import Path +from typing import TYPE_CHECKING +from unittest.mock import MagicMock + +from codebase_rag import constants as cs +from codebase_rag.tests.conftest import get_relationships, run_updater + +if TYPE_CHECKING: + pass + + +def _get_method_caller_calls(mock_ingestor: MagicMock) -> list: + return [ + c + for c in get_relationships(mock_ingestor, cs.RelationshipType.CALLS) + if c.args[0][0] == cs.NodeLabel.METHOD + ] + + +def _get_function_caller_calls(mock_ingestor: MagicMock) -> list: + return [ + c + for c in get_relationships(mock_ingestor, cs.RelationshipType.CALLS) + if c.args[0][0] == cs.NodeLabel.FUNCTION + ] + + +def _get_module_caller_calls(mock_ingestor: MagicMock) -> list: + return [ + c + for c in get_relationships(mock_ingestor, cs.RelationshipType.CALLS) + if c.args[0][0] == cs.NodeLabel.MODULE + ] + + +def _caller_qn(call: MagicMock) -> str: + return call.args[0][2] + + +def _callee_qn(call: MagicMock) -> str: + return call.args[2][2] + + +class TestCppMethodCallerAttribution: + def test_simple_class_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "player.cpp").write_text( + encoding="utf-8", + data=""" +class Player { +public: + void handleArtifact() {} + + void handleArtifactWatcherCb() { + handleArtifact(); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + callees = [_callee_qn(c) for c in method_calls] + + watcher_callers = [qn for qn in callers if "handleArtifactWatcherCb" in qn] + assert len(watcher_callers) >= 1 + + artifact_callees = [qn for qn in callees if "handleArtifact" in qn] + assert len(artifact_callees) >= 1 + + def test_struct_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "sensor.cpp").write_text( + encoding="utf-8", + data=""" +struct Sensor { + int readRaw() { return 42; } + + int readCalibrated() { + return readRaw() * 2; + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + callees = [_callee_qn(c) for c in method_calls] + + assert any("readCalibrated" in qn for qn in callers) + assert any("readRaw" in qn for qn in callees) + + def test_multiple_methods_calling_each_other( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "calc.cpp").write_text( + encoding="utf-8", + data=""" +class Calculator { +public: + int add(int a, int b) { return a + b; } + int multiply(int a, int b) { return a * b; } + + int compute(int x) { + int sum = add(x, 1); + return multiply(sum, 2); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + compute_calls = [c for c in method_calls if "compute" in _caller_qn(c)] + compute_callees = {_callee_qn(c) for c in compute_calls} + + assert any("add" in qn for qn in compute_callees) + assert any("multiply" in qn for qn in compute_callees) + + def test_constructor_body_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "init.cpp").write_text( + encoding="utf-8", + data=""" +class Engine { +public: + void initialize() {} + + Engine() { + initialize(); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = 
_get_method_caller_calls(mock_ingestor) + callees = [_callee_qn(c) for c in method_calls] + assert any("initialize" in qn for qn in callees) + + def test_method_calling_free_function_has_method_caller( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "mixed.cpp").write_text( + encoding="utf-8", + data=""" +void freeHelper() {} + +class Service { +public: + void process() { + freeHelper(); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + process_calls = [c for c in method_calls if "process" in _caller_qn(c)] + assert len(process_calls) >= 1 + + def test_multiple_classes_in_one_file( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "multi.cpp").write_text( + encoding="utf-8", + data=""" +class Alpha { +public: + void step1() {} + void run() { step1(); } +}; + +class Beta { +public: + void step2() {} + void execute() { step2(); } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = {_caller_qn(c) for c in method_calls} + callees = {_callee_qn(c) for c in method_calls} + + assert any("run" in qn for qn in callers) + assert any("execute" in qn for qn in callers) + assert any("step1" in qn for qn in callees) + assert any("step2" in qn for qn in callees) + + def test_method_with_parameters( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "params.cpp").write_text( + encoding="utf-8", + data=""" +class Parser { +public: + int parse(const char* input, int length) { return 0; } + + int parseFile(const char* path) { + return parse(path, 100); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("parseFile" in qn for qn in callers) + + def test_virtual_method_calls( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "virtual.cpp").write_text( + encoding="utf-8", + data=""" +class Base { +public: + virtual void onEvent() {} + + void dispatch() { + onEvent(); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + dispatch_calls = [c for c in method_calls if "dispatch" in _caller_qn(c)] + assert len(dispatch_calls) >= 1 + assert any("onEvent" in _callee_qn(c) for c in dispatch_calls) + + def test_method_calling_another_via_this_pointer( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "this_ptr.cpp").write_text( + encoding="utf-8", + data=""" +class Widget { +public: + void repaint() {} + + void resize(int w, int h) { + this->repaint(); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("resize" in qn for qn in callers) + + def test_deeply_nested_call_chain( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "chain.cpp").write_text( + encoding="utf-8", + data=""" +class Pipeline { +public: + int validate() { return 1; } + int transform(int x) { return x * 2; } + int output(int x) { return x; } + + int run() { + int v = validate(); + int t = transform(v); + return output(t); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) 
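+ # Descriptive note: run() reaches validate, transform and output in sequence, + # so each callee should surface as its own Method -> Method CALLS edge with + # Pipeline::run attributed as the caller (asserted below).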
+ + method_calls = _get_method_caller_calls(mock_ingestor) + run_calls = [c for c in method_calls if "run" in _caller_qn(c)] + run_callees = {_callee_qn(c) for c in run_calls} + + assert any("validate" in qn for qn in run_callees) + assert any("transform" in qn for qn in run_callees) + assert any("output" in qn for qn in run_callees) + + def test_static_method_calls( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "static.cpp").write_text( + encoding="utf-8", + data=""" +class Factory { +public: + static int create() { return 0; } + + static int build() { + return create(); + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("build" in qn for qn in callers) + + def test_const_method_calls( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "const.cpp").write_text( + encoding="utf-8", + data=""" +class Container { +public: + int size() const { return 10; } + + bool empty() const { + return size() == 0; + } +}; +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.CPP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("empty" in qn for qn in callers) + + +class TestPythonMethodCallerAttribution: + def test_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "service.py").write_text( + encoding="utf-8", + data=""" +class Service: + def validate(self): + pass + + def process(self): + self.validate() +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.PYTHON) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("process" in qn for qn in callers) + + def test_multiple_methods_calling_each_other( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "pipeline.py").write_text( + encoding="utf-8", + data=""" +class Pipeline: + def step1(self): + pass + + def step2(self): + self.step1() + + def run(self): + self.step2() +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.PYTHON) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = {_caller_qn(c) for c in method_calls} + assert any("step2" in qn for qn in callers) + assert any("run" in qn for qn in callers) + + def test_dunder_init_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "init.py").write_text( + encoding="utf-8", + data=""" +class Config: + def _load(self): + pass + + def __init__(self): + self._load() +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.PYTHON) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("__init__" in qn for qn in callers) + + +class TestJavaScriptMethodCallerAttribution: + def test_class_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "service.js").write_text( + encoding="utf-8", + data=""" +class Service { + validate() { + return true; + } + + process() { + return this.validate(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.JS) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("process" in qn for qn in callers) + + def 
test_constructor_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "widget.js").write_text( + encoding="utf-8", + data=""" +class Widget { + setup() {} + + constructor() { + this.setup(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.JS) + + method_calls = _get_method_caller_calls(mock_ingestor) + callees = [_callee_qn(c) for c in method_calls] + assert any("setup" in qn for qn in callees) + + +class TestTypeScriptMethodCallerAttribution: + def test_class_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "handler.ts").write_text( + encoding="utf-8", + data=""" +class Handler { + private validate(): boolean { + return true; + } + + public handle(): void { + this.validate(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.TS) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("handle" in qn for qn in callers) + + def test_multiple_methods_with_types( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "repo.ts").write_text( + encoding="utf-8", + data=""" +class Repository { + find(id: number): string { return ""; } + validate(data: string): boolean { return true; } + + save(id: number): boolean { + const item = this.find(id); + return this.validate(item); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.TS) + + method_calls = _get_method_caller_calls(mock_ingestor) + save_calls = [c for c in method_calls if "save" in _caller_qn(c)] + save_callees = {_callee_qn(c) for c in save_calls} + assert any("find" in qn for qn in save_callees) + assert any("validate" in qn for qn in save_callees) + + +class TestJavaMethodCallerAttribution: + def test_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "Service.java").write_text( + encoding="utf-8", + data=""" +public class Service { + private boolean validate() { + return true; + } + + public void process() { + validate(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.JAVA) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("process" in qn for qn in callers) + + def test_constructor_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "Config.java").write_text( + encoding="utf-8", + data=""" +public class Config { + private void loadDefaults() {} + + public Config() { + loadDefaults(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.JAVA) + + method_calls = _get_method_caller_calls(mock_ingestor) + callees = [_callee_qn(c) for c in method_calls] + assert any("loadDefaults" in qn for qn in callees) + + def test_multiple_methods_calling_each_other( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "Calculator.java").write_text( + encoding="utf-8", + data=""" +public class Calculator { + public int add(int a, int b) { return a + b; } + public int multiply(int a, int b) { return a * b; } + + public int compute(int x) { + int sum = add(x, 1); + return multiply(sum, 2); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.JAVA) + + method_calls = _get_method_caller_calls(mock_ingestor) + compute_calls = [c for c in method_calls if "compute" in _caller_qn(c)] + compute_callees = {_callee_qn(c) for c in compute_calls} + 
assert any("add" in qn for qn in compute_callees) + assert any("multiply" in qn for qn in compute_callees) + + +class TestRustMethodCallerAttribution: + def test_impl_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "lib.rs").write_text( + encoding="utf-8", + data=""" +struct Player { + health: i32, +} + +impl Player { + fn heal(&mut self) { + self.health += 10; + } + + fn take_damage(&mut self, amount: i32) { + self.health -= amount; + self.heal(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.RUST) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("take_damage" in qn for qn in callers) + + def test_multiple_impl_methods( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "lib.rs").write_text( + encoding="utf-8", + data=""" +struct Pipeline; + +impl Pipeline { + fn validate(&self) -> bool { true } + fn transform(&self, x: i32) -> i32 { x * 2 } + + fn run(&self, input: i32) -> i32 { + if self.validate() { + self.transform(input) + } else { + 0 + } + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.RUST) + + method_calls = _get_method_caller_calls(mock_ingestor) + run_calls = [c for c in method_calls if "run" in _caller_qn(c)] + assert len(run_calls) >= 1 + + +class TestPhpMethodCallerAttribution: + def test_method_calls_method( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "service.php").write_text( + encoding="utf-8", + data="""validate(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.PHP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = [_caller_qn(c) for c in method_calls] + assert any("process" in qn for qn in callers) + + def test_multiple_methods_calling_each_other( + self, temp_repo: Path, mock_ingestor: MagicMock + ) -> None: + (temp_repo / "pipeline.php").write_text( + encoding="utf-8", + data="""step1(); + } + + public function run() { + $this->step2(); + } +} +""", + ) + run_updater(temp_repo, mock_ingestor, cs.SupportedLanguage.PHP) + + method_calls = _get_method_caller_calls(mock_ingestor) + callers = {_caller_qn(c) for c in method_calls} + assert any("step2" in qn for qn in callers) + assert any("run" in qn for qn in callers) diff --git a/codebase_rag/tests/test_node_relationship_coverage.py b/codebase_rag/tests/test_node_relationship_coverage.py index e6af5fd05..00389af7a 100644 --- a/codebase_rag/tests/test_node_relationship_coverage.py +++ b/codebase_rag/tests/test_node_relationship_coverage.py @@ -136,18 +136,15 @@ def test_each_relationship_type_can_be_flushed( ingestor.conn = mock_conn - ingestor.relationship_buffer.append( - ( - (NodeLabel.MODULE.value, KEY_QUALIFIED_NAME, "module.test"), - rel_type.value, - (NodeLabel.FUNCTION.value, KEY_QUALIFIED_NAME, "module.test.func"), - None, - ) + ingestor.ensure_relationship_batch( + (NodeLabel.MODULE.value, KEY_QUALIFIED_NAME, "module.test"), + rel_type.value, + (NodeLabel.FUNCTION.value, KEY_QUALIFIED_NAME, "module.test.func"), ) ingestor.flush_relationships() mock_cursor.execute.assert_called_once() - assert ingestor.relationship_buffer == [] + assert ingestor._rel_count == 0 class TestUniqueKeyPropertyNames: @@ -230,10 +227,13 @@ def test_ensure_constraints_creates_all_constraints(self) -> None: ingestor = MemgraphIngestor(host="localhost", port=7687) executed_queries: list[str] = [] - def capture_query(query: str) -> None: + def 
diff --git a/codebase_rag/tests/test_php_functions.py b/codebase_rag/tests/test_php_functions.py new file mode 100644 index 000000000..992d5c900 --- /dev/null +++ b/codebase_rag/tests/test_php_functions.py @@ -0,0 +1,153 @@ +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.tests.conftest import get_relationships +from codebase_rag.types_defs import NodeType + + +def test_php_function_discovery(temp_repo: Path, mock_ingestor: MagicMock) -> None: + project_path = temp_repo / "php_functions_test" + project_path.mkdir() + + (project_path / "example.php").write_text( + encoding="utf-8", + data="""<?php +class MyPhpClass { + private $value; + + public function __construct() { + $this->value = 0; + } + + public function getValue() { + return $this->value; + } +} + +interface MyInterface { + public function doSomething(); +} + +enum Status { + case Active; + case Inactive; +} + +function standaloneFunction() { + $obj = new MyPhpClass(); + return $obj->getValue(); +} +""", + ) + + parsers, queries = load_parsers() + assert "php" in parsers, "PHP parser should be available" + + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=project_path, + parsers=parsers, + queries=queries, + ) + updater.run() + + created_functions = [ + c + for c in mock_ingestor.ensure_node_batch.call_args_list + if c[0][0] == NodeType.FUNCTION + ] + fn_qns = {c[0][1]["qualified_name"] for c in created_functions} + + assert any(qn.endswith(".standaloneFunction") for qn in fn_qns), fn_qns + + call_rels = get_relationships(mock_ingestor, "CALLS") + assert len(call_rels) >= 1 + + +def test_php_class_discovery(temp_repo: Path, mock_ingestor: MagicMock) -> None: + project_path = temp_repo / "php_class_test" + project_path.mkdir() + + (project_path / "models.php").write_text( + encoding="utf-8", + data="""<?php +class User { +} + +class Admin extends User { +} +""", + ) + + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=project_path, + parsers=parsers, + queries=queries, + ) + updater.run() + + created_classes = [ + c + for c in mock_ingestor.ensure_node_batch.call_args_list + if c[0][0] == NodeType.CLASS + ] + class_qns = {c[0][1]["qualified_name"] for c in created_classes} + + assert any(qn.endswith(".User") for qn in class_qns), class_qns + assert any(qn.endswith(".Admin") for qn in class_qns), class_qns + + +def test_php_method_calls(temp_repo: Path, mock_ingestor: MagicMock) -> None: + project_path = temp_repo / "php_calls_test" + project_path.mkdir() + + (project_path / "service.php").write_text( + encoding="utf-8", + data="""<?php +class Calculator { + public function add($a, $b) { + return $a + $b; + } + + public function calculate() { + return $this->add(1, 2); + } +} + +function main() { + $calc = new Calculator(); + $calc->calculate(); +} +""", + ) + + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=project_path, + parsers=parsers, + queries=queries, + ) + updater.run() + + call_rels = get_relationships(mock_ingestor, "CALLS") + assert len(call_rels) >= 2
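Both PHP test modules lean on the get_relationships helper imported from conftest, which this diff never shows. A plausible minimal reading of it, consistent with how tests elsewhere in the diff index the returned call objects — the body below is an assumption, not the repo's actual conftest code:

from unittest.mock import MagicMock

def get_relationships(mock_ingestor: MagicMock, rel_type: str) -> list:
    # ensure_relationship_batch is called as (source, rel_type, target), so
    # filtering on the second positional argument selects one edge type.
    return [
        call
        for call in mock_ingestor.ensure_relationship_batch.call_args_list
        if call.args[1] == rel_type
    ]

Under that reading, args[0] and args[2] are the (label, key, value) source and target tuples seen in ensure_relationship_batch, which is why assertions later in this diff index call.args[0][2] and call.args[2][2] to get qualified names.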
diff --git a/codebase_rag/tests/test_php_imports.py b/codebase_rag/tests/test_php_imports.py new file mode 100644 index 000000000..9f8e2ef59 --- /dev/null +++ b/codebase_rag/tests/test_php_imports.py @@ -0,0 +1,93 @@ +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.tests.conftest import get_relationships + + +def test_php_use_statement_import(temp_repo: Path, mock_ingestor: MagicMock) -> None: + project_path = temp_repo / "php_imports_test" + project_path.mkdir() + + (project_path / "Controller.php").write_text( + encoding="utf-8", + data="""<?php + +use App\Service\ProductService; +use App\Repository\ProductRepository as Repo; + +class Controller { + public function index() { + $service = new ProductService(); + $repo = new Repo(); + } +} +""", + ) + + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=project_path, + parsers=parsers, + queries=queries, + ) + updater.run() + + import_rels = get_relationships(mock_ingestor, "IMPORTS") + assert len(import_rels) >= 1 + + controller_module = f"{project_path.name}.Controller" + import_mapping = updater.factory.import_processor.import_mapping + if controller_module in import_mapping: + mapping = import_mapping[controller_module] + assert "ProductService" in mapping + assert mapping["ProductService"] == "App.Service.ProductService" + assert "Repo" in mapping + assert mapping["Repo"] == "App.Repository.ProductRepository" + + +def test_php_multiple_use_statements(temp_repo: Path, mock_ingestor: MagicMock) -> None: + project_path = temp_repo / "php_multi_imports" + project_path.mkdir() + + (project_path / "app.php").write_text( + encoding="utf-8", + data="""<?php + +use App\Service\OrderService; +use App\Service\PaymentService; +use App\Repository\OrderRepository as Orders; + +class App { +} +""", + ) + + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=project_path, + parsers=parsers, + queries=queries, + ) + updater.run() + + app_module = f"{project_path.name}.app" + import_mapping = updater.factory.import_processor.import_mapping + if app_module in import_mapping: + mapping = import_mapping[app_module] + assert "OrderService" in mapping + assert "PaymentService" in mapping + assert "Orders" in mapping diff --git a/codebase_rag/tests/test_project_name.py b/codebase_rag/tests/test_project_name.py new file mode 100644 index 000000000..000000000 --- /dev/null +++ b/codebase_rag/tests/test_project_name.py @@ -0,0 +1,246 @@ +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.tests.conftest import get_node_names + + +@pytest.fixture +def parsers_and_queries() -> tuple[dict, dict]: + return load_parsers() + + +def _make_updater( + repo_path: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + project_name: str | None = None, +) -> GraphUpdater: + parsers, queries = parsers_and_queries + return GraphUpdater( + ingestor=mock_ingestor, + repo_path=repo_path, + parsers=parsers, + queries=queries, + project_name=project_name, + ) + + +def _write_python_file(repo_path: Path, rel_path: str, content: str) -> None: + full = repo_path / rel_path + full.parent.mkdir(parents=True, exist_ok=True) + full.write_text(content) + + +class TestDefaultProjectName: + def test_default_uses_directory_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater(temp_repo, mock_ingestor, parsers_and_queries) + assert updater.project_name == temp_repo.resolve().name + + def test_default_none_uses_directory_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name=None + ) + assert updater.project_name == temp_repo.resolve().name + + def test_default_empty_string_uses_directory_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="" + ) + assert updater.project_name == temp_repo.resolve().name + + def test_default_whitespace_only_uses_directory_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name=" " + ) + assert updater.project_name == temp_repo.resolve().name + + +class TestExplicitProjectName: + def test_override_simple( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="MyProject" + ) + assert updater.project_name == "MyProject" + + def test_override_with_hyphens( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, + mock_ingestor, + parsers_and_queries, + project_name="my-cool-project", + ) + assert updater.project_name == "my-cool-project" + + def 
test_override_with_dots( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, + mock_ingestor, + parsers_and_queries, + project_name="com.example.app", + ) + assert updater.project_name == "com.example.app" + + +class TestEdgeCases: + def test_generic_dir_name_src( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + src_dir = temp_repo / "src" + src_dir.mkdir() + updater = _make_updater( + src_dir, mock_ingestor, parsers_and_queries, project_name="BlazingRenderer" + ) + assert updater.project_name == "BlazingRenderer" + updater_default = _make_updater(src_dir, mock_ingestor, parsers_and_queries) + assert updater_default.project_name == "src" + + def test_generic_dir_name_main( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + main_dir = temp_repo / "main" + main_dir.mkdir() + updater = _make_updater( + main_dir, + mock_ingestor, + parsers_and_queries, + project_name="ActualProjectName", + ) + assert updater.project_name == "ActualProjectName" + + def test_version_named_directory( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + ver_dir = temp_repo / "v1.3.2" + ver_dir.mkdir() + updater = _make_updater( + ver_dir, mock_ingestor, parsers_and_queries, project_name="my-library" + ) + assert updater.project_name == "my-library" + updater_default = _make_updater(ver_dir, mock_ingestor, parsers_and_queries) + assert updater_default.project_name == "v1.3.2" + + def test_nested_same_name_parent( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + nested = temp_repo / "BRender" / "BlazingRenderer" + nested.mkdir(parents=True) + updater = _make_updater( + nested, mock_ingestor, parsers_and_queries, project_name="BlazingRenderer" + ) + assert updater.project_name == "BlazingRenderer" + + +class TestFactoryPropagation: + def test_factory_receives_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomName" + ) + assert updater.factory.project_name == "CustomName" + + def test_factory_default_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater(temp_repo, mock_ingestor, parsers_and_queries) + assert updater.factory.project_name == temp_repo.resolve().name + + def test_structure_processor_receives_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomName" + ) + assert updater.factory.structure_processor.project_name == "CustomName" + + def test_import_processor_receives_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomName" + ) + assert updater.factory.import_processor.project_name == "CustomName" + + def test_definition_processor_receives_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = 
_make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomName" + ) + assert updater.factory.definition_processor.project_name == "CustomName" + + def test_call_processor_receives_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomName" + ) + assert updater.factory.call_processor.project_name == "CustomName" + + def test_type_inference_receives_project_name( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomName" + ) + assert updater.factory.type_inference.project_name == "CustomName" + + +class TestQualifiedNameIntegration: + def test_module_qualified_names_use_override( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + _write_python_file(temp_repo, "hello.py", "def greet():\n pass\n") + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="MyApp" + ) + updater.run(force=True) + module_names = get_node_names(mock_ingestor, "Module") + assert "MyApp.hello" in module_names + + def test_function_qualified_names_use_override( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + _write_python_file(temp_repo, "utils.py", "def helper():\n return 42\n") + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="MyApp" + ) + updater.run(force=True) + func_names = get_node_names(mock_ingestor, "Function") + assert "MyApp.utils.helper" in func_names + + def test_class_qualified_names_use_override( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + _write_python_file(temp_repo, "models.py", "class User:\n pass\n") + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="MyApp" + ) + updater.run(force=True) + class_names = get_node_names(mock_ingestor, "Class") + assert "MyApp.models.User" in class_names + + def test_default_qualified_names_use_directory( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + _write_python_file(temp_repo, "foo.py", "def bar():\n pass\n") + updater = _make_updater(temp_repo, mock_ingestor, parsers_and_queries) + updater.run(force=True) + dir_name = temp_repo.resolve().name + func_names = get_node_names(mock_ingestor, "Function") + assert f"{dir_name}.foo.bar" in func_names + + def test_package_qualified_names_use_override( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + _write_python_file(temp_repo, "pkg/__init__.py", "") + _write_python_file(temp_repo, "pkg/core.py", "def run():\n pass\n") + updater = _make_updater( + temp_repo, mock_ingestor, parsers_and_queries, project_name="CustomProj" + ) + updater.run(force=True) + func_names = get_node_names(mock_ingestor, "Function") + assert "CustomProj.pkg.core.run" in func_names + + def test_override_vs_default_different_names( + self, + temp_repo: Path, + mock_ingestor: MagicMock, + parsers_and_queries: tuple[dict, dict], + ) -> None: + _write_python_file(temp_repo, "app.py", "def main():\n pass\n") + dir_name = temp_repo.resolve().name + updater = _make_updater( + temp_repo, 
mock_ingestor, parsers_and_queries, project_name="OverrideName" + ) + updater.run(force=True) + func_names = get_node_names(mock_ingestor, "Function") + assert "OverrideName.app.main" in func_names + assert f"{dir_name}.app.main" not in func_names diff --git a/codebase_rag/tests/test_provider_classes.py b/codebase_rag/tests/test_provider_classes.py index 1475914a0..d7b0eb9c3 100644 --- a/codebase_rag/tests/test_provider_classes.py +++ b/codebase_rag/tests/test_provider_classes.py @@ -9,6 +9,8 @@ from codebase_rag.constants import GoogleProviderType, Provider from codebase_rag.providers.base import ( + AnthropicProvider, + AzureOpenAIProvider, GoogleProvider, ModelProvider, OllamaProvider, @@ -37,16 +39,42 @@ def test_get_valid_providers(self) -> None: assert isinstance(ollama_provider, OllamaProvider) assert ollama_provider.provider_name == Provider.OLLAMA + anthropic_provider = get_provider(Provider.ANTHROPIC, api_key="test-key") + assert isinstance(anthropic_provider, AnthropicProvider) + assert anthropic_provider.provider_name == Provider.ANTHROPIC + + azure_provider = get_provider( + Provider.AZURE, + api_key="test-key", + endpoint="https://myresource.openai.azure.com", + ) + assert isinstance(azure_provider, AzureOpenAIProvider) + assert azure_provider.provider_name == Provider.AZURE + def test_get_invalid_provider(self) -> None: with pytest.raises(ValueError, match="Unknown provider 'invalid_provider'"): get_provider("invalid_provider") + def test_get_litellm_provider(self) -> None: + litellm_provider = get_provider( + Provider.LITELLM_PROXY, + api_key="sk-test", + endpoint="http://localhost:4000/v1", + ) + from codebase_rag.providers.litellm import LiteLLMProvider + + assert isinstance(litellm_provider, LiteLLMProvider) + assert litellm_provider.provider_name == Provider.LITELLM_PROXY + def test_list_providers(self) -> None: providers = list_providers() assert Provider.GOOGLE in providers assert Provider.OPENAI in providers assert Provider.OLLAMA in providers - assert len(providers) >= 3 + assert Provider.ANTHROPIC in providers + assert Provider.AZURE in providers + assert Provider.LITELLM_PROXY in providers + assert len(providers) >= 6 def test_register_custom_provider(self) -> None: class CustomProvider(ModelProvider): @@ -190,6 +218,94 @@ def test_ollama_validation_connection_error(self, mock_client: Any) -> None: provider.validate_config() +class TestAnthropicProvider: + def test_anthropic_configuration(self) -> None: + provider = AnthropicProvider(api_key="sk-ant-test-key") + assert provider.provider_name == Provider.ANTHROPIC + assert provider.api_key == "sk-ant-test-key" + provider.validate_config() + + def test_anthropic_validation_error(self) -> None: + provider = AnthropicProvider() + with pytest.raises(ValueError, match="Anthropic provider requires api_key"): + provider.validate_config() + + @patch("codebase_rag.providers.base.PydanticAnthropicProvider") + @patch("codebase_rag.providers.base.AnthropicModel") + def test_anthropic_model_creation( + self, mock_anthropic_model: Any, mock_anthropic_provider: Any + ) -> None: + provider = AnthropicProvider(api_key="sk-ant-test-key") + mock_model = MagicMock() + mock_anthropic_model.return_value = mock_model + result = provider.create_model("claude-opus-4-6") + mock_anthropic_model.assert_called_once() + assert result == mock_model + + def test_anthropic_api_key_from_env(self) -> None: + with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "env-key"}): + provider = AnthropicProvider() + assert provider.api_key == "env-key" + + 
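The Anthropic tests above pin down two behaviours without ever showing the provider's implementation: the constructor falls back to the ANTHROPIC_API_KEY environment variable, and validate_config() fails fast when no key is present. A minimal sketch of that contract — the class below is hypothetical illustration, not the AnthropicProvider defined in codebase_rag/providers/base.py:

import os

class EnvFallbackProvider:
    # Hypothetical stand-in for the behaviour the tests assert.
    def __init__(self, api_key: str | None = None) -> None:
        # An explicit key wins; otherwise fall back to the environment,
        # which is what test_anthropic_api_key_from_env exercises via
        # patch.dict("os.environ", ...).
        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")

    def validate_config(self) -> None:
        # Mirrors the error message matched in test_anthropic_validation_error.
        if not self.api_key:
            raise ValueError("Anthropic provider requires api_key")

Reading the key at construction time, rather than inside validate_config(), is what lets the patched environment be observed immediately after instantiation in the test.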
+class TestAzureOpenAIProvider: + def test_azure_configuration(self) -> None: + provider = AzureOpenAIProvider( + api_key="azure-key", + endpoint="https://myresource.openai.azure.com", + api_version="2024-06-01", + ) + assert provider.provider_name == Provider.AZURE + assert provider.api_key == "azure-key" + assert provider.endpoint == "https://myresource.openai.azure.com" + assert provider.api_version == "2024-06-01" + provider.validate_config() + + def test_azure_validation_error_no_key(self) -> None: + provider = AzureOpenAIProvider(endpoint="https://myresource.openai.azure.com") + with pytest.raises(ValueError, match="Azure OpenAI provider requires api_key"): + provider.validate_config() + + def test_azure_validation_error_no_endpoint(self) -> None: + provider = AzureOpenAIProvider(api_key="azure-key") + with pytest.raises(ValueError, match="Azure OpenAI provider requires endpoint"): + provider.validate_config() + + @patch("codebase_rag.providers.base.PydanticAzureProvider") + @patch("codebase_rag.providers.base.OpenAIChatModel") + def test_azure_model_creation( + self, mock_chat_model: Any, mock_azure_provider: Any + ) -> None: + provider = AzureOpenAIProvider( + api_key="azure-key", + endpoint="https://myresource.openai.azure.com", + ) + mock_model = MagicMock() + mock_chat_model.return_value = mock_model + result = provider.create_model("gpt-4o") + mock_azure_provider.assert_called_once_with( + api_key="azure-key", + azure_endpoint="https://myresource.openai.azure.com", + api_version=None, + ) + mock_chat_model.assert_called_once_with( + "gpt-4o", provider=mock_azure_provider.return_value + ) + assert result == mock_model + + def test_azure_api_key_from_env(self) -> None: + with patch.dict( + "os.environ", + { + "AZURE_API_KEY": "env-key", + "AZURE_OPENAI_ENDPOINT": "https://env.openai.azure.com", + }, + ): + provider = AzureOpenAIProvider() + assert provider.api_key == "env-key" + assert provider.endpoint == "https://env.openai.azure.com" + + class TestModelCreation: @patch("codebase_rag.providers.base.PydanticGoogleProvider") @patch("codebase_rag.providers.base.GoogleModel") @@ -275,3 +391,109 @@ def test_ollama_model_creation( mock_openai_provider.assert_called_once_with( api_key="ollama", base_url="http://localhost:11434/v1" ) + + +class TestLiteLLMProvider: + def test_litellm_configuration(self) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + provider = LiteLLMProvider( + api_key="sk-litellm-key", endpoint="http://litellm:4000/v1" + ) + assert provider.provider_name == Provider.LITELLM_PROXY + assert provider.api_key == "sk-litellm-key" + assert provider.endpoint == "http://litellm:4000/v1" + + def test_litellm_default_endpoint(self) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + provider = LiteLLMProvider() + assert provider.endpoint == "http://localhost:4000/v1" + + def test_litellm_no_endpoint_validation_error(self) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + provider = LiteLLMProvider(endpoint="") + with pytest.raises(ValueError, match="LiteLLM provider requires endpoint"): + provider.validate_config() + + @patch("httpx.Client") + def test_litellm_validation_success(self, mock_client: Any) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_client.return_value.__enter__.return_value.get.return_value = mock_response + + provider = LiteLLMProvider(api_key="sk-test", endpoint="http://litellm:4000/v1") + 
provider.validate_config() + + @patch("httpx.Client") + def test_litellm_validation_server_not_running(self, mock_client: Any) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + mock_response = MagicMock() + mock_response.status_code = 404 + mock_client.return_value.__enter__.return_value.get.return_value = mock_response + + provider = LiteLLMProvider(endpoint="http://litellm:4000/v1") + with pytest.raises(ValueError, match="LiteLLM proxy server not responding"): + provider.validate_config() + + @patch("httpx.Client") + def test_litellm_validation_fallback_to_models_endpoint( + self, mock_client: Any + ) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + health_response = MagicMock() + health_response.status_code = 401 + models_response = MagicMock() + models_response.status_code = 200 + mock_client.return_value.__enter__.return_value.get.side_effect = [ + health_response, + models_response, + ] + + provider = LiteLLMProvider(api_key="sk-test", endpoint="http://litellm:4000/v1") + provider.validate_config() + + @patch("httpx.Client") + def test_litellm_validation_connection_error(self, mock_client: Any) -> None: + import httpx + + from codebase_rag.providers.litellm import LiteLLMProvider + + mock_client.return_value.__enter__.return_value.get.side_effect = ( + httpx.ConnectError("Connection failed") + ) + + provider = LiteLLMProvider(endpoint="http://litellm:4000/v1") + with pytest.raises(ValueError, match="LiteLLM proxy server not responding"): + provider.validate_config() + + @patch("codebase_rag.providers.litellm.PydanticLiteLLMProvider") + @patch("codebase_rag.providers.litellm.OpenAIChatModel") + @patch("httpx.Client") + def test_litellm_model_creation( + self, mock_client: Any, mock_chat_model: Any, mock_litellm_provider: Any + ) -> None: + from codebase_rag.providers.litellm import LiteLLMProvider + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_client.return_value.__enter__.return_value.get.return_value = mock_response + + provider = LiteLLMProvider(api_key="sk-test", endpoint="http://litellm:4000/v1") + mock_model = MagicMock() + mock_chat_model.return_value = mock_model + + result = provider.create_model("openai/gpt-4o") + + mock_litellm_provider.assert_called_once_with( + api_key="sk-test", api_base="http://litellm:4000/v1" + ) + mock_chat_model.assert_called_once_with( + "openai/gpt-4o", provider=mock_litellm_provider.return_value + ) + assert result == mock_model diff --git a/codebase_rag/tests/test_python_nested_functions.py b/codebase_rag/tests/test_python_nested_functions.py index 66f64b989..2a164d94d 100644 --- a/codebase_rag/tests/test_python_nested_functions.py +++ b/codebase_rag/tests/test_python_nested_functions.py @@ -318,10 +318,6 @@ def main(): def test_function_in_class_method( nested_functions_project: Path, mock_ingestor: MagicMock ) -> None: - """Test that functions inside class methods are properly handled. - - Note: Functions inside methods are currently treated as methods rather than nested functions. 
- """ parsers, queries = load_parsers() updater = GraphUpdater( @@ -333,21 +329,51 @@ def test_function_in_class_method( updater.run() project_name = nested_functions_project.name - - expected_method_qn = f"{project_name}.nested_functions.OuterClass.nested_in_method" - created_methods = get_node_names(mock_ingestor, "Method") - assert expected_method_qn in created_methods, ( - f"Function in method not found as method: {expected_method_qn}" + assert ( + f"{project_name}.nested_functions.OuterClass.method_with_nested" + in created_methods + ) + + nested_qn = f"{project_name}.nested_functions.OuterClass.nested_in_method" + assert nested_qn not in created_methods, ( + f"Nested function inside method should not be ingested as class method: {nested_qn}" ) - expected_class_methods = [ - f"{project_name}.nested_functions.OuterClass.method_with_nested", - f"{project_name}.nested_functions.OuterClass.nested_in_method", - ] - for expected_method in expected_class_methods: - assert expected_method in created_methods, ( - f"Expected method not found: {expected_method}" +def test_nested_function_in_staticmethod_not_ingested_as_method( + temp_repo: Path, mock_ingestor: MagicMock +) -> None: + project_path = temp_repo / "static_nested" + os.makedirs(project_path) + (project_path / "__init__.py").touch() + + with open(project_path / "api.py", "w") as f: + f.write( + "class Api:\n" + " @staticmethod\n" + " def say_hello():\n" + " def test_func():\n" + ' print("api")\n' + " pass\n" ) + + parsers, queries = load_parsers() + updater = GraphUpdater( + ingestor=mock_ingestor, + repo_path=project_path, + parsers=parsers, + queries=queries, + ) + updater.run() + + project_name = project_path.name + created_methods = get_node_names(mock_ingestor, "Method") + + assert f"{project_name}.api.Api.say_hello" in created_methods + + bad_qn = f"{project_name}.api.Api.test_func" + assert bad_qn not in created_methods, ( + f"Nested function inside staticmethod should not be ingested as class method: {bad_qn}" + ) diff --git a/codebase_rag/tests/test_python_real_world.py b/codebase_rag/tests/test_python_real_world.py index 770014655..0243e2f04 100644 --- a/codebase_rag/tests/test_python_real_world.py +++ b/codebase_rag/tests/test_python_real_world.py @@ -874,24 +874,20 @@ class PlainTaskSchema(Schema): return project_path -def test_flask_model_calls( +def test_flask_no_calls_to_class_nodes( todo_app_project: Path, mock_ingestor: MagicMock, ) -> None: - """Test detection of model usage in controllers.""" + """Test that Class nodes are not targets of CALLS relationships.""" run_updater(todo_app_project, mock_ingestor) function_calls = get_relationships(mock_ingestor, "CALLS") - model_usage_calls = [ - call - for call in function_calls - if "task_controller" in call.args[0][2] and "TaskModel" in call.args[2][2] - ] + class_calls = [call for call in function_calls if call.args[2][0] == "Class"] - assert model_usage_calls, ( - f"Expected TaskController to use TaskModel, found: " - f"{[(c.args[0][2], c.args[2][2]) for c in model_usage_calls]}" + assert not class_calls, ( + f"Expected no CALLS edges to Class nodes, found: " + f"{[(c.args[0][2], c.args[2][2]) for c in class_calls]}" ) diff --git a/codebase_rag/tests/test_python_standard_library_imports.py b/codebase_rag/tests/test_python_standard_library_imports.py index c7cfa891e..98ec5f673 100644 --- a/codebase_rag/tests/test_python_standard_library_imports.py +++ b/codebase_rag/tests/test_python_standard_library_imports.py @@ -11,10 +11,10 @@ class TestStandardLibraryImports: 
"""Test import resolution for standard library vs local modules.""" @pytest.fixture - def mock_updater(self) -> GraphUpdater: + def mock_updater(self, tmp_path: Path) -> GraphUpdater: mock_ingestor = MagicMock() - test_repo = Path("/tmp/myproject") + test_repo = tmp_path / "myproject" test_repo.mkdir(exist_ok=True) (test_repo / "utils").mkdir(exist_ok=True) diff --git a/codebase_rag/tests/test_query_truncation.py b/codebase_rag/tests/test_query_truncation.py new file mode 100644 index 000000000..d1ea8854e --- /dev/null +++ b/codebase_rag/tests/test_query_truncation.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from codebase_rag.tools.codebase_query import create_query_tool +from codebase_rag.types_defs import ResultRow + + +@pytest.fixture +def mock_ingestor() -> MagicMock: + return MagicMock() + + +@pytest.fixture +def mock_cypher_gen() -> MagicMock: + gen = MagicMock() + gen.generate = AsyncMock(return_value="MATCH (n) RETURN n") + return gen + + +class TestQueryTruncation: + @pytest.mark.asyncio + async def test_row_cap_truncation( + self, mock_ingestor: MagicMock, mock_cypher_gen: MagicMock + ) -> None: + rows: list[ResultRow] = [{"name": f"node_{i}"} for i in range(600)] + mock_ingestor.fetch_all.return_value = rows + + tool = create_query_tool(mock_ingestor, mock_cypher_gen) + with patch("codebase_rag.tools.codebase_query.settings") as mock_settings: + mock_settings.QUERY_RESULT_ROW_CAP = 500 + mock_settings.QUERY_RESULT_MAX_TOKENS = 100000 + result = await tool.function(natural_language_query="list all nodes") + + assert len(result.results) <= 500 + assert "truncated" in result.summary.lower() or "600" in result.summary + + @pytest.mark.asyncio + async def test_token_truncation( + self, mock_ingestor: MagicMock, mock_cypher_gen: MagicMock + ) -> None: + rows: list[ResultRow] = [ + {"name": f"function_{i}", "body": f"def func_{i}(): pass # {'x' * 200}"} + for i in range(100) + ] + mock_ingestor.fetch_all.return_value = rows + + tool = create_query_tool(mock_ingestor, mock_cypher_gen) + with patch("codebase_rag.tools.codebase_query.settings") as mock_settings: + mock_settings.QUERY_RESULT_ROW_CAP = 500 + mock_settings.QUERY_RESULT_MAX_TOKENS = 500 + result = await tool.function(natural_language_query="list functions") + + assert len(result.results) < 100 + assert "truncated" in result.summary.lower() + + @pytest.mark.asyncio + async def test_no_truncation_when_within_limits( + self, mock_ingestor: MagicMock, mock_cypher_gen: MagicMock + ) -> None: + rows: list[ResultRow] = [{"name": f"node_{i}"} for i in range(5)] + mock_ingestor.fetch_all.return_value = rows + + tool = create_query_tool(mock_ingestor, mock_cypher_gen) + with patch("codebase_rag.tools.codebase_query.settings") as mock_settings: + mock_settings.QUERY_RESULT_ROW_CAP = 500 + mock_settings.QUERY_RESULT_MAX_TOKENS = 16000 + result = await tool.function(natural_language_query="small query") + + assert len(result.results) == 5 + assert "Successfully" in result.summary diff --git a/codebase_rag/tests/test_realtime_debounce.py b/codebase_rag/tests/test_realtime_debounce.py new file mode 100644 index 000000000..eee1fcf48 --- /dev/null +++ b/codebase_rag/tests/test_realtime_debounce.py @@ -0,0 +1,445 @@ +""" +Tests for the realtime_updater debouncing functionality. + +These tests verify the hybrid debounce strategy that prevents redundant +graph updates during rapid file saves. 
+""" + +from __future__ import annotations + +import threading +import time +from pathlib import Path +from typing import Any +from unittest.mock import MagicMock + +import pytest +from watchdog.events import FileCreatedEvent, FileDeletedEvent, FileModifiedEvent + +from codebase_rag.constants import DEFAULT_DEBOUNCE_SECONDS, DEFAULT_MAX_WAIT_SECONDS +from codebase_rag.services import QueryProtocol + + +class MockQueryIngestor: + def __init__(self) -> None: + self.execute_write = MagicMock() + self.flush_all = MagicMock() + self.fetch_all = MagicMock(return_value=[]) + self.ensure_node_batch = MagicMock() + self.ensure_relationship_batch = MagicMock() + + def __enter__(self) -> MockQueryIngestor: + return self + + def __exit__(self, *args: Any) -> None: + pass + + +# Register MockQueryIngestor as implementing QueryProtocol for isinstance checks +QueryProtocol.register(MockQueryIngestor) + + +class TestCodeChangeEventHandlerDebounce: + @pytest.fixture(autouse=True) + def _patch_ignore(self, monkeypatch: pytest.MonkeyPatch) -> None: + from codebase_rag import constants as cs + + patched = cs.IGNORE_PATTERNS - {"tmp"} + monkeypatch.setattr(cs, "IGNORE_PATTERNS", patched) + monkeypatch.setattr("realtime_updater.IGNORE_PATTERNS", patched) + + @pytest.fixture + def mock_ingestor(self) -> MockQueryIngestor: + return MockQueryIngestor() + + @pytest.fixture + def mock_updater( + self, tmp_path: Path, mock_ingestor: MockQueryIngestor + ) -> MagicMock: + updater = MagicMock() + updater.repo_path = tmp_path + updater.ingestor = mock_ingestor + updater.remove_file_from_state = MagicMock() + updater.factory = MagicMock() + updater.factory.definition_processor.process_file = MagicMock(return_value=None) + updater._process_function_calls = MagicMock() + updater.parsers = {} + updater.queries = {} + updater.ast_cache = {} + return updater + + @pytest.fixture + def sample_file(self, tmp_path: Path) -> Path: + test_file = tmp_path / "test.py" + test_file.write_text("# test file") + return test_file + + def test_handler_initialization_with_debounce( + self, mock_updater: MagicMock + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=5, max_wait_seconds=30 + ) + + assert handler.debounce_seconds == 5 + assert handler.max_wait_seconds == 30 + assert handler.debounce_enabled is True + assert len(handler.timers) == 0 + assert len(handler.first_event_time) == 0 + assert len(handler.pending_events) == 0 + + def test_handler_initialization_without_debounce( + self, mock_updater: MagicMock + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0, max_wait_seconds=30 + ) + + assert handler.debounce_seconds == 0 + assert handler.debounce_enabled is False + + def test_handler_uses_default_constants(self, mock_updater: MagicMock) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler(mock_updater) + + assert handler.debounce_seconds == DEFAULT_DEBOUNCE_SECONDS + assert handler.max_wait_seconds == DEFAULT_MAX_WAIT_SECONDS + + def test_is_relevant_filters_ignored_patterns( + self, mock_updater: MagicMock, tmp_path: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler(mock_updater) + + # Should be ignored (directories in ignore patterns) + assert handler._is_relevant(str(tmp_path / ".git" / "config")) is False + assert handler._is_relevant(str(tmp_path / 
"node_modules" / "pkg.js")) is False + assert handler._is_relevant(str(tmp_path / "__pycache__" / "mod.pyc")) is False + + # Should be relevant + assert handler._is_relevant(str(tmp_path / "main.py")) is True + assert handler._is_relevant(str(tmp_path / "src" / "lib.rs")) is True + assert handler._is_relevant(str(tmp_path / "app.js")) is True + + def test_dispatch_ignores_directories( + self, mock_updater: MagicMock, mock_ingestor: MockQueryIngestor, tmp_path: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.1, max_wait_seconds=1 + ) + + # Create event that is marked as directory + event = FileModifiedEvent(str(tmp_path / "some_dir")) + # The is_directory property is set by watchdog based on the event type + # For FileModifiedEvent, we need to check is_directory attribute + object.__setattr__(event, "is_directory", True) + + handler.dispatch(event) + + # No timer should be created for directory events + assert len(handler.timers) == 0 + mock_ingestor.execute_write.assert_not_called() + + def test_debounce_batches_rapid_events( + self, + mock_updater: MagicMock, + mock_ingestor: MockQueryIngestor, + sample_file: Path, + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.2, max_wait_seconds=5 + ) + + # Simulate 5 rapid saves + for _ in range(5): + event = FileModifiedEvent(str(sample_file)) + handler.dispatch(event) + time.sleep(0.05) # 50ms between saves + + # Should have one pending event + assert len(handler.pending_events) == 1 + + # Wait for debounce to complete + time.sleep(0.4) + + # After debounce, ingestor should have been called only once + mock_ingestor.flush_all.assert_called_once() + + def test_no_debounce_processes_immediately( + self, + mock_updater: MagicMock, + mock_ingestor: MockQueryIngestor, + sample_file: Path, + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0, max_wait_seconds=30 + ) + + event = FileModifiedEvent(str(sample_file)) + handler.dispatch(event) + + # Should process immediately (no pending events) + assert len(handler.pending_events) == 0 + assert len(handler.timers) == 0 + mock_ingestor.flush_all.assert_called_once() + + def test_max_wait_forces_update( + self, + mock_updater: MagicMock, + mock_ingestor: MockQueryIngestor, + sample_file: Path, + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.5, max_wait_seconds=0.3 + ) + + # First event + event = FileModifiedEvent(str(sample_file)) + handler.dispatch(event) + + # Wait until max_wait is exceeded + time.sleep(0.4) + + # Second event should trigger immediate processing due to max_wait + event2 = FileModifiedEvent(str(sample_file)) + handler.dispatch(event2) + + # Give time for processing + time.sleep(0.15) + + # Should have processed at least once due to max_wait + assert mock_ingestor.flush_all.call_count >= 1 + + def test_different_files_tracked_separately( + self, mock_updater: MagicMock, tmp_path: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + file1 = tmp_path / "file1.py" + file2 = tmp_path / "file2.py" + file1.write_text("# file 1") + file2.write_text("# file 2") + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.2, max_wait_seconds=5 + ) + + # Events for different files + event1 = 
FileModifiedEvent(str(file1)) + event2 = FileModifiedEvent(str(file2)) + + handler.dispatch(event1) + handler.dispatch(event2) + + # Should have two pending events + assert len(handler.pending_events) == 2 + assert len(handler.timers) == 2 + + def test_timer_cleanup_after_processing( + self, + mock_updater: MagicMock, + mock_ingestor: MockQueryIngestor, + sample_file: Path, + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.1, max_wait_seconds=5 + ) + + event = FileModifiedEvent(str(sample_file)) + handler.dispatch(event) + + # Should have pending state + assert len(handler.pending_events) == 1 + assert len(handler.first_event_time) == 1 + + # Wait for processing + time.sleep(0.25) + + # State should be cleaned up + assert len(handler.pending_events) == 0 + assert len(handler.first_event_time) == 0 + assert len(handler.timers) == 0 + + def test_created_event_triggers_debounce( + self, mock_updater: MagicMock, tmp_path: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + new_file = tmp_path / "new_file.py" + new_file.write_text("# new file") + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.2, max_wait_seconds=5 + ) + + event = FileCreatedEvent(str(new_file)) + handler.dispatch(event) + + assert len(handler.pending_events) == 1 + + def test_deleted_event_triggers_debounce( + self, mock_updater: MagicMock, sample_file: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.2, max_wait_seconds=5 + ) + + event = FileDeletedEvent(str(sample_file)) + handler.dispatch(event) + + assert len(handler.pending_events) == 1 + + def test_thread_safety_concurrent_events( + self, mock_updater: MagicMock, tmp_path: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=5.0, max_wait_seconds=30 + ) + + files = [tmp_path / f"file{i}.py" for i in range(10)] + for f in files: + f.write_text(f"# {f.name}") + + def send_events(file_path: Path) -> None: + for _ in range(5): + event = FileModifiedEvent(str(file_path)) + handler.dispatch(event) + time.sleep(0.02) + + # Send events from multiple threads + threads = [threading.Thread(target=send_events, args=(f,)) for f in files[:5]] + for t in threads: + t.start() + for t in threads: + t.join() + + # Should have 5 pending events (one per file) + assert len(handler.pending_events) == 5 + + +class TestDebounceValidation: + def test_validate_non_negative_float_accepts_zero(self) -> None: + from realtime_updater import _validate_non_negative_float + + assert _validate_non_negative_float(0) == 0 + assert _validate_non_negative_float(0.0) == 0.0 + + def test_validate_non_negative_float_accepts_positive(self) -> None: + from realtime_updater import _validate_non_negative_float + + assert _validate_non_negative_float(5) == 5 + assert _validate_non_negative_float(0.5) == 0.5 + assert _validate_non_negative_float(100) == 100 + + def test_validate_non_negative_float_rejects_negative(self) -> None: + import typer + + from realtime_updater import _validate_non_negative_float + + with pytest.raises(typer.BadParameter): + _validate_non_negative_float(-1) + + with pytest.raises(typer.BadParameter): + _validate_non_negative_float(-0.1) + + +class TestDebounceIntegration: + @pytest.fixture(autouse=True) + def _patch_ignore(self, monkeypatch: pytest.MonkeyPatch) -> None: + 
from codebase_rag import constants as cs + + patched = cs.IGNORE_PATTERNS - {"tmp"} + monkeypatch.setattr(cs, "IGNORE_PATTERNS", patched) + monkeypatch.setattr("realtime_updater.IGNORE_PATTERNS", patched) + + @pytest.fixture + def mock_ingestor(self) -> MockQueryIngestor: + return MockQueryIngestor() + + @pytest.fixture + def mock_updater( + self, tmp_path: Path, mock_ingestor: MockQueryIngestor + ) -> MagicMock: + updater = MagicMock() + updater.repo_path = tmp_path + updater.ingestor = mock_ingestor + updater.remove_file_from_state = MagicMock() + updater.factory = MagicMock() + updater.factory.definition_processor.process_file = MagicMock(return_value=None) + updater._process_function_calls = MagicMock() + updater.parsers = {} + updater.queries = {} + updater.ast_cache = {} + return updater + + def test_realistic_rapid_save_scenario( + self, mock_updater: MagicMock, mock_ingestor: MockQueryIngestor, tmp_path: Path + ) -> None: + """ + Simulate realistic rapid save scenario: + - User saves file 10 times over 3 seconds + - With 0.5s debounce and 2s max_wait, should result in ~2-4 updates + """ + from realtime_updater import CodeChangeEventHandler + + test_file = tmp_path / "editor.py" + test_file.write_text("# editing") + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.5, max_wait_seconds=2 + ) + + # Simulate 10 saves over 3 seconds + for i in range(10): + event = FileModifiedEvent(str(test_file)) + handler.dispatch(event) + time.sleep(0.3) + + # Wait for final debounce + time.sleep(0.7) + + # Should have batched into fewer updates due to max_wait and debounce + # With max_wait=2s and 3s total time, expect ~2-4 updates + call_count = mock_ingestor.flush_all.call_count + assert 1 <= call_count <= 4, f"Expected 1-4 updates, got {call_count}" + + def test_single_edit_after_quiet_period( + self, mock_updater: MagicMock, mock_ingestor: MockQueryIngestor, tmp_path: Path + ) -> None: + from realtime_updater import CodeChangeEventHandler + + test_file = tmp_path / "single.py" + test_file.write_text("# single edit") + + handler = CodeChangeEventHandler( + mock_updater, debounce_seconds=0.1, max_wait_seconds=5 + ) + + event = FileModifiedEvent(str(test_file)) + handler.dispatch(event) + + # Wait for debounce + time.sleep(0.25) + + # Should have exactly one update + mock_ingestor.flush_all.assert_called_once() diff --git a/codebase_rag/tests/test_realtime_event_filtering.py b/codebase_rag/tests/test_realtime_event_filtering.py new file mode 100644 index 000000000..68f641d93 --- /dev/null +++ b/codebase_rag/tests/test_realtime_event_filtering.py @@ -0,0 +1,210 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Protocol, runtime_checkable +from unittest.mock import MagicMock + +import pytest +from watchdog.events import ( + FileClosedNoWriteEvent, + FileCreatedEvent, + FileDeletedEvent, + FileModifiedEvent, + FileOpenedEvent, + FileSystemEvent, +) + +from codebase_rag import constants as cs +from realtime_updater import CodeChangeEventHandler + + +@runtime_checkable +class _AnyProtocol(Protocol): + pass + + +@pytest.fixture(autouse=True) +def _bypass_protocol_check(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("realtime_updater.QueryProtocol", _AnyProtocol) + + +@pytest.fixture +def handler(mock_updater: MagicMock) -> CodeChangeEventHandler: + h = CodeChangeEventHandler(mock_updater, debounce_seconds=0) + h.ignore_patterns = h.ignore_patterns - {"tmp", "temp"} + return h + + +def _make_event(event_type: str, src_path: str) -> 
FileSystemEvent: + ev = MagicMock(spec=FileSystemEvent) + ev.event_type = event_type + ev.src_path = src_path + ev.is_directory = False + return ev + + +class TestEventFiltering: + def test_modified_event_is_processed( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "app.py" + f.write_text("x = 1", encoding="utf-8") + handler.dispatch(FileModifiedEvent(str(f))) + assert mock_updater.ingestor.execute_write.call_count == 3 + + def test_created_event_is_processed( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "new.py" + f.write_text("y = 2", encoding="utf-8") + handler.dispatch(FileCreatedEvent(str(f))) + assert mock_updater.ingestor.execute_write.call_count == 3 + mock_updater.ingestor.flush_all.assert_called_once() + + def test_deleted_event_is_processed( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "gone.py" + handler.dispatch(FileDeletedEvent(str(f))) + assert mock_updater.ingestor.execute_write.call_count == 3 + mock_updater.factory.definition_processor.process_file.assert_not_called() + mock_updater.factory.structure_processor.process_generic_file.assert_not_called() + + def test_opened_event_is_ignored( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "read_only.py" + f.touch() + handler.dispatch(FileOpenedEvent(str(f))) + mock_updater.ingestor.execute_write.assert_not_called() + mock_updater.ingestor.flush_all.assert_not_called() + + def test_closed_no_write_event_is_ignored( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "viewed.py" + f.touch() + handler.dispatch(FileClosedNoWriteEvent(str(f))) + mock_updater.ingestor.execute_write.assert_not_called() + mock_updater.ingestor.flush_all.assert_not_called() + + def test_access_event_is_ignored( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "accessed.py" + f.touch() + ev = _make_event("access", str(f)) + handler.dispatch(ev) + mock_updater.ingestor.execute_write.assert_not_called() + mock_updater.ingestor.flush_all.assert_not_called() + + +class TestNonCodeFileHandling: + def test_markdown_file_creates_file_node( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "readme.md" + f.write_text("# Title", encoding="utf-8") + handler.dispatch(FileCreatedEvent(str(f))) + mock_updater.factory.structure_processor.process_generic_file.assert_called_once_with( + f, "readme.md" + ) + + def test_json_file_creates_file_node( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "config.json" + f.write_text("{}", encoding="utf-8") + handler.dispatch(FileCreatedEvent(str(f))) + mock_updater.factory.structure_processor.process_generic_file.assert_called_once_with( + f, "config.json" + ) + + def test_non_code_file_deletion_removes_file_node( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "notes.md" + handler.dispatch(FileDeletedEvent(str(f))) + delete_file_calls = [ + c + for c in mock_updater.ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_FILE + ] + assert len(delete_file_calls) == 1 + assert delete_file_calls[0].args[1] == { + cs.KEY_PATH: "notes.md", + } 
+ mock_updater.factory.structure_processor.process_generic_file.assert_not_called() + + def test_non_code_file_has_no_module_node( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "data.md" + f.write_text("text", encoding="utf-8") + handler.dispatch(FileCreatedEvent(str(f))) + mock_updater.factory.definition_processor.process_file.assert_not_called() + + +class TestMixedEventSequences: + def test_rapid_create_modify_delete( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f = temp_repo / "ephemeral.py" + f.write_text("a = 1", encoding="utf-8") + handler.dispatch(FileCreatedEvent(str(f))) + + mock_updater.ingestor.reset_mock() + mock_updater.factory.reset_mock() + f.write_text("a = 2", encoding="utf-8") + handler.dispatch(FileModifiedEvent(str(f))) + + mock_updater.ingestor.reset_mock() + mock_updater.factory.reset_mock() + handler.dispatch(FileDeletedEvent(str(f))) + + # (H) After delete, no re-parse or file node creation + mock_updater.factory.definition_processor.process_file.assert_not_called() + mock_updater.factory.structure_processor.process_generic_file.assert_not_called() + assert mock_updater.ingestor.execute_write.call_count == 3 + mock_updater.ingestor.flush_all.assert_called_once() + + def test_multiple_files_changed( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f1 = temp_repo / "a.py" + f2 = temp_repo / "b.py" + f1.write_text("x = 1", encoding="utf-8") + f2.write_text("y = 2", encoding="utf-8") + + handler.dispatch(FileModifiedEvent(str(f1))) + handler.dispatch(FileModifiedEvent(str(f2))) + + assert mock_updater.ingestor.execute_write.call_count == 6 + assert mock_updater.ingestor.flush_all.call_count == 2 + + +class TestCypherDeleteFileQuery: + def test_delete_file_only_targets_specific_path( + self, handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path + ) -> None: + f1 = temp_repo / "keep.py" + f2 = temp_repo / "remove.py" + f1.write_text("a = 1", encoding="utf-8") + + handler.dispatch(FileDeletedEvent(str(f2))) + + delete_file_calls = [ + c + for c in mock_updater.ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_FILE + ] + assert len(delete_file_calls) == 1 + assert delete_file_calls[0].args[1] == {cs.KEY_PATH: "remove.py"} + + delete_module_calls = [ + c + for c in mock_updater.ingestor.execute_write.call_args_list + if c.args[0] == cs.CYPHER_DELETE_MODULE + ] + assert len(delete_module_calls) == 1 + assert delete_module_calls[0].args[1] == {cs.KEY_PATH: "remove.py"} diff --git a/codebase_rag/tests/test_realtime_updater.py b/codebase_rag/tests/test_realtime_updater.py index c53b5b6ae..fdf1b604a 100644 --- a/codebase_rag/tests/test_realtime_updater.py +++ b/codebase_rag/tests/test_realtime_updater.py @@ -1,4 +1,7 @@ +from __future__ import annotations + from pathlib import Path +from typing import Protocol, runtime_checkable from unittest.mock import MagicMock import pytest @@ -12,10 +15,21 @@ from realtime_updater import CodeChangeEventHandler +@runtime_checkable +class _AnyProtocol(Protocol): + pass + + +@pytest.fixture(autouse=True) +def _bypass_protocol_check(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("realtime_updater.QueryProtocol", _AnyProtocol) + + @pytest.fixture def event_handler(mock_updater: MagicMock) -> CodeChangeEventHandler: - """Provides a CodeChangeEventHandler instance with a mocked updater.""" - return 
CodeChangeEventHandler(mock_updater) + handler = CodeChangeEventHandler(mock_updater, debounce_seconds=0) + handler.ignore_patterns = handler.ignore_patterns - {"tmp", "temp"} + return handler def test_file_creation_flow( @@ -28,7 +42,8 @@ def test_file_creation_flow( event_handler.dispatch(event) - assert mock_updater.ingestor.execute_write.call_count == 2 + # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS + assert mock_updater.ingestor.execute_write.call_count == 3 mock_updater.factory.definition_processor.process_file.assert_called_once_with( test_file, "python", @@ -48,7 +63,8 @@ def test_file_modification_flow( event_handler.dispatch(event) - assert mock_updater.ingestor.execute_write.call_count == 2 + # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS + assert mock_updater.ingestor.execute_write.call_count == 3 mock_updater.factory.definition_processor.process_file.assert_called_once_with( test_file, "python", @@ -67,7 +83,8 @@ def test_file_deletion_flow( event_handler.dispatch(event) - assert mock_updater.ingestor.execute_write.call_count == 2 + # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS + assert mock_updater.ingestor.execute_write.call_count == 3 mock_updater.factory.definition_processor.process_file.assert_not_called() mock_updater.ingestor.flush_all.assert_called_once() @@ -103,16 +120,22 @@ def test_directory_creation_is_ignored( mock_updater.ingestor.flush_all.assert_not_called() -def test_unsupported_file_types_are_ignored( +def test_non_code_files_create_file_nodes( event_handler: CodeChangeEventHandler, mock_updater: MagicMock, temp_repo: Path ) -> None: - """Test that changing an unsupported file type is ignored after deletion query.""" - unsupported_file = temp_repo / "document.md" - unsupported_file.write_text(encoding="utf-8", data="# Markdown file") - event = FileModifiedEvent(str(unsupported_file)) + """Test that non-code files (like .md) create File nodes but skip AST parsing.""" + non_code_file = temp_repo / "document.md" + non_code_file.write_text(encoding="utf-8", data="# Markdown file") + event = FileModifiedEvent(str(non_code_file)) event_handler.dispatch(event) - assert mock_updater.ingestor.execute_write.call_count == 2 + # (H) 3 execute_write calls: DELETE_MODULE, DELETE_FILE, DELETE_CALLS + assert mock_updater.ingestor.execute_write.call_count == 3 + # (H) AST parsing is skipped for non-code files mock_updater.factory.definition_processor.process_file.assert_not_called() + # (H) But File node creation IS called for all file types + mock_updater.factory.structure_processor.process_generic_file.assert_called_once_with( + non_code_file, "document.md" + ) mock_updater.ingestor.flush_all.assert_called_once() diff --git a/codebase_rag/tests/test_reconcile_embeddings.py b/codebase_rag/tests/test_reconcile_embeddings.py new file mode 100644 index 000000000..0e69f646e --- /dev/null +++ b/codebase_rag/tests/test_reconcile_embeddings.py @@ -0,0 +1,94 @@ +from collections.abc import Generator +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +from loguru import logger + +from codebase_rag.graph_updater import GraphUpdater +from codebase_rag.parser_loader import load_parsers +from codebase_rag.services.graph_service import MemgraphIngestor + + +@pytest.fixture +def updater(temp_repo: Path) -> GraphUpdater: + mock = MagicMock(spec=MemgraphIngestor) + mock.fetch_all = MagicMock(return_value=[]) + parsers, queries = load_parsers() + return GraphUpdater( + ingestor=mock, + 
repo_path=temp_repo, + parsers=parsers, + queries=queries, + ) + + +@pytest.fixture +def log_messages() -> Generator[list[str], None, None]: + messages: list[str] = [] + handler_id = logger.add(lambda msg: messages.append(str(msg)), level="DEBUG") + yield messages + logger.remove(handler_id) + + +class TestReconcileEmbeddings: + def test_noop_when_expected_empty(self, updater: GraphUpdater) -> None: + mock_fn = MagicMock() + updater._reconcile_embeddings(set(), mock_fn) + mock_fn.assert_not_called() + + def test_logs_ok_when_all_found( + self, updater: GraphUpdater, log_messages: list[str] + ) -> None: + expected = {1, 2, 3} + mock_fn = MagicMock(return_value={1, 2, 3}) + + updater._reconcile_embeddings(expected, mock_fn) + + mock_fn.assert_called_once_with(expected) + combined = "\n".join(log_messages) + assert "all 3 expected embeddings found" in combined + + def test_logs_warning_when_ids_missing( + self, updater: GraphUpdater, log_messages: list[str] + ) -> None: + expected = {1, 2, 3, 4, 5} + mock_fn = MagicMock(return_value={1, 3}) + + updater._reconcile_embeddings(expected, mock_fn) + + combined = "\n".join(log_messages) + assert "3 of 5 embeddings missing" in combined + + def test_sample_ids_in_warning( + self, updater: GraphUpdater, log_messages: list[str] + ) -> None: + expected = {10, 20, 30} + mock_fn = MagicMock(return_value={10}) + + updater._reconcile_embeddings(expected, mock_fn) + + combined = "\n".join(log_messages) + assert "20" in combined + assert "30" in combined + + def test_handles_verify_fn_exception( + self, updater: GraphUpdater, log_messages: list[str] + ) -> None: + mock_fn = MagicMock(side_effect=RuntimeError("connection lost")) + + updater._reconcile_embeddings({1, 2}, mock_fn) + + combined = "\n".join(log_messages).lower() + assert "reconciliation check failed" in combined + + def test_sample_limited_to_ten( + self, updater: GraphUpdater, log_messages: list[str] + ) -> None: + expected = set(range(20)) + mock_fn = MagicMock(return_value=set()) + + updater._reconcile_embeddings(expected, mock_fn) + + combined = "\n".join(log_messages) + assert "20 of 20 embeddings missing" in combined diff --git a/codebase_rag/tests/test_rust.py b/codebase_rag/tests/test_rust.py index 0751458e6..14f534809 100644 --- a/codebase_rag/tests/test_rust.py +++ b/codebase_rag/tests/test_rust.py @@ -302,25 +302,43 @@ def test_rust_structs_enums_unions( project_name = rust_project.name - expected_classes = [ + expected_structs = [ f"{project_name}.types.Point", f"{project_name}.types.Color", f"{project_name}.types.Unit", f"{project_name}.types.Container", f"{project_name}.types.Borrowed", f"{project_name}.types.GenericBorrowed", + ] + + created_classes = get_node_names(mock_ingestor, "Class") + + missing_structs = set(expected_structs) - created_classes + assert not missing_structs, ( + f"Missing expected structs: {sorted(list(missing_structs))}" + ) + + expected_enums = [ f"{project_name}.types.Direction", f"{project_name}.types.Message", f"{project_name}.types.Option", f"{project_name}.types.Cow", + ] + + created_enums = get_node_names(mock_ingestor, "Enum") + + missing_enums = set(expected_enums) - created_enums + assert not missing_enums, f"Missing expected enums: {sorted(list(missing_enums))}" + + expected_unions = [ f"{project_name}.types.FloatOrInt", ] - created_classes = get_node_names(mock_ingestor, "Class") + created_unions = get_node_names(mock_ingestor, "Union") - missing_classes = set(expected_classes) - created_classes - assert not missing_classes, ( - f"Missing expected 
types: {sorted(list(missing_classes))}" + missing_unions = set(expected_unions) - created_unions + assert not missing_unions, ( + f"Missing expected unions: {sorted(list(missing_unions))}" ) expected_methods = [ @@ -495,6 +513,13 @@ def test_rust_traits_and_implementations( f"{project_name}.traits.Drawable", ] + created_interfaces = get_node_names(mock_ingestor, "Interface") + + missing_traits = set(expected_traits) - created_interfaces + assert not missing_traits, ( + f"Missing expected traits: {sorted(list(missing_traits))}" + ) + expected_structs = [ f"{project_name}.traits.Point", f"{project_name}.traits.Circle", @@ -502,10 +527,9 @@ def test_rust_traits_and_implementations( created_classes = get_node_names(mock_ingestor, "Class") - all_expected = expected_traits + expected_structs - missing_classes = set(all_expected) - created_classes - assert not missing_classes, ( - f"Missing expected traits/structs: {sorted(list(missing_classes))}" + missing_structs = set(expected_structs) - created_classes + assert not missing_structs, ( + f"Missing expected structs: {sorted(list(missing_structs))}" ) expected_methods = [ @@ -1059,19 +1083,27 @@ def test_rust_pattern_matching( project_name = rust_project.name - expected_types = [ - f"{project_name}.pattern_matching.Color", - f"{project_name}.pattern_matching.Message", + expected_structs = [ f"{project_name}.pattern_matching.Point", ] created_classes = get_node_names(mock_ingestor, "Class") - found_types = set(expected_types) & created_classes - assert len(found_types) >= 3, ( - f"Expected at least 3 types, found: {sorted(list(found_types))}" + missing_structs = set(expected_structs) - created_classes + assert not missing_structs, ( + f"Missing expected structs: {sorted(list(missing_structs))}" ) + expected_enums = [ + f"{project_name}.pattern_matching.Color", + f"{project_name}.pattern_matching.Message", + ] + + created_enums = get_node_names(mock_ingestor, "Enum") + + missing_enums = set(expected_enums) - created_enums + assert not missing_enums, f"Missing expected enums: {sorted(list(missing_enums))}" + expected_functions = [ f"{project_name}.pattern_matching.match_color", f"{project_name}.pattern_matching.match_with_guards", @@ -1535,19 +1567,25 @@ def test_rust_macros( ) expected_structs = [ - f"{project_name}.macros.Person", - f"{project_name}.macros.Point", f"{project_name}.macros.MacroStruct", - f"{project_name}.macros.MacroEnum", ] created_classes = get_node_names(mock_ingestor, "Class") - found_structs = set(expected_structs) & created_classes - assert len(found_structs) >= 2, ( - f"Expected at least 2 macro structs, found: {sorted(list(found_structs))}" + missing_structs = set(expected_structs) - created_classes + assert not missing_structs, ( + f"Missing expected structs: {sorted(list(missing_structs))}" ) + expected_enums = [ + f"{project_name}.macros.MacroEnum", + ] + + created_enums = get_node_names(mock_ingestor, "Enum") + + missing_enums = set(expected_enums) - created_enums + assert not missing_enums, f"Missing expected enums: {sorted(list(missing_enums))}" + def test_rust_imports_and_use_statements( rust_project: Path, @@ -2050,9 +2088,9 @@ def test_rust_error_handling( f"{project_name}.error_handling.CustomError", ] - created_classes = get_node_names(mock_ingestor, "Class") + created_enums = get_node_names(mock_ingestor, "Enum") - found_enums = set(expected_enums) & created_classes + found_enums = set(expected_enums) & created_enums assert len(found_enums) >= 1, ( f"Expected at least 1 custom error enum, found: 
{sorted(list(found_enums))}" ) @@ -2403,18 +2441,36 @@ def test_rust_comprehensive_integration( project_name = rust_project.name - expected_types = [ + expected_structs = [ f"{project_name}.comprehensive.User", - f"{project_name}.comprehensive.RepositoryError", f"{project_name}.comprehensive.UserRepository", - f"{project_name}.comprehensive.Repository", ] created_classes = get_node_names(mock_ingestor, "Class") - found_types = set(expected_types) & created_classes - assert len(found_types) >= 3, ( - f"Expected at least 3 comprehensive types, found: {sorted(list(found_types))}" + missing_structs = set(expected_structs) - created_classes + assert not missing_structs, ( + f"Missing expected structs: {sorted(list(missing_structs))}" + ) + + expected_enums = [ + f"{project_name}.comprehensive.RepositoryError", + ] + + created_enums = get_node_names(mock_ingestor, "Enum") + + missing_enums = set(expected_enums) - created_enums + assert not missing_enums, f"Missing expected enums: {sorted(list(missing_enums))}" + + expected_interfaces = [ + f"{project_name}.comprehensive.Repository", + ] + + created_interfaces = get_node_names(mock_ingestor, "Interface") + + missing_interfaces = set(expected_interfaces) - created_interfaces + assert not missing_interfaces, ( + f"Missing expected traits: {sorted(list(missing_interfaces))}" ) diff --git a/codebase_rag/tests/test_rust_node_type.py b/codebase_rag/tests/test_rust_node_type.py new file mode 100644 index 000000000..edfa95e13 --- /dev/null +++ b/codebase_rag/tests/test_rust_node_type.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from codebase_rag import constants as cs +from codebase_rag.parsers.class_ingest.node_type import determine_node_type +from codebase_rag.tests.conftest import ( + create_mock_node, + get_node_names, + run_updater, +) +from codebase_rag.types_defs import NodeType + + +@pytest.mark.parametrize( + ("ts_node_type", "expected"), + [ + (cs.TS_RS_ENUM_ITEM, NodeType.ENUM), + (cs.TS_RS_TRAIT_ITEM, NodeType.INTERFACE), + (cs.TS_RS_TYPE_ITEM, NodeType.TYPE), + (cs.TS_RS_UNION_ITEM, NodeType.UNION), + (cs.TS_RS_STRUCT_ITEM, NodeType.CLASS), + ], +) +def test_determine_node_type_rust(ts_node_type: str, expected: NodeType) -> None: + node = create_mock_node(ts_node_type) + result = determine_node_type(node, "Foo", "crate::Foo", cs.SupportedLanguage.RUST) + assert result == expected + + +@pytest.fixture +def rust_node_type_project(temp_repo: Path) -> Path: + project_path = temp_repo / "rust_node_type_test" + project_path.mkdir() + (project_path / "Cargo.toml").write_text( + encoding="utf-8", + data='[package]\nname = "rust_node_type_test"\nversion = "0.1.0"\n', + ) + (project_path / "src").mkdir() + (project_path / "src" / "lib.rs").write_text(encoding="utf-8", data="") + (project_path / "types.rs").write_text( + encoding="utf-8", + data=( + "pub enum Color { Red, Green, Blue }\n" + "pub trait Drawable { fn draw(&self); }\n" + "pub type Pair = (i32, i32);\n" + "pub union IntOrFloat { i: i32, f: f32 }\n" + "pub struct Point { pub x: f64, pub y: f64 }\n" + ), + ) + return project_path + + +def test_rust_enum_label( + rust_node_type_project: Path, mock_ingestor: MagicMock +) -> None: + run_updater(rust_node_type_project, mock_ingestor, skip_if_missing="rust") + enum_names = get_node_names(mock_ingestor, NodeType.ENUM) + assert len(enum_names) == 1 + assert enum_names.pop().endswith(".Color") + + +def test_rust_trait_label( + rust_node_type_project: 
Path, mock_ingestor: MagicMock +) -> None: + run_updater(rust_node_type_project, mock_ingestor, skip_if_missing="rust") + interface_names = get_node_names(mock_ingestor, NodeType.INTERFACE) + assert len(interface_names) == 1 + assert interface_names.pop().endswith(".Drawable") + + +def test_rust_type_alias_label( + rust_node_type_project: Path, mock_ingestor: MagicMock +) -> None: + run_updater(rust_node_type_project, mock_ingestor, skip_if_missing="rust") + type_names = get_node_names(mock_ingestor, NodeType.TYPE) + assert len(type_names) == 1 + assert type_names.pop().endswith(".Pair") + + +def test_rust_union_label( + rust_node_type_project: Path, mock_ingestor: MagicMock +) -> None: + run_updater(rust_node_type_project, mock_ingestor, skip_if_missing="rust") + union_names = get_node_names(mock_ingestor, NodeType.UNION) + assert len(union_names) == 1 + assert union_names.pop().endswith(".IntOrFloat") + + +def test_rust_struct_label( + rust_node_type_project: Path, mock_ingestor: MagicMock +) -> None: + run_updater(rust_node_type_project, mock_ingestor, skip_if_missing="rust") + class_names = get_node_names(mock_ingestor, NodeType.CLASS) + assert len(class_names) == 1 + assert class_names.pop().endswith(".Point") diff --git a/codebase_rag/tests/test_shell_command.py b/codebase_rag/tests/test_shell_command.py index f745b2e30..e9b151628 100644 --- a/codebase_rag/tests/test_shell_command.py +++ b/codebase_rag/tests/test_shell_command.py @@ -1,5 +1,6 @@ from __future__ import annotations +import sys from pathlib import Path from unittest.mock import MagicMock @@ -386,6 +387,9 @@ async def test_simple_pipe( assert result.return_code == 0 assert "5" in result.stdout + @pytest.mark.skipif( + sys.platform == "win32", reason="Unix find not available on Windows" + ) async def test_find_with_wc( self, shell_commander: ShellCommander, temp_project_root: Path ) -> None: @@ -398,6 +402,10 @@ async def test_find_with_wc( async def test_rg_in_pipeline( self, shell_commander: ShellCommander, temp_project_root: Path ) -> None: + import shutil + + if not shutil.which("rg"): + pytest.skip("rg (ripgrep) not installed") (temp_project_root / "data.txt").write_text("foo\nbar\nbaz\n", encoding="utf-8") result = await shell_commander.execute("cat data.txt | rg bar") assert result.return_code == 0 @@ -630,11 +638,11 @@ def test_path_outside_project(self, tmp_path: Path) -> None: ["rm", "-rf", "../other"], project_root ) assert is_dangerous - assert "outside project" in reason + assert "outside project" in reason or "system directory" in reason def test_safe_path_inside_project(self, tmp_path: Path) -> None: - project_root = tmp_path / "project" - project_root.mkdir() + project_root = (tmp_path / "project").resolve() + project_root.mkdir(exist_ok=True) is_dangerous, _ = _is_dangerous_rm_path( ["rm", "-rf", "subdir/file.txt"], project_root ) @@ -741,7 +749,8 @@ async def test_rm_outside_project_blocked( ) -> None: result = await shell_commander.execute("rm ../outside_project") assert result.return_code == -1 - assert "outside project" in result.stderr.lower() + stderr_lower = result.stderr.lower() + assert "outside project" in stderr_lower or "system directory" in stderr_lower class TestAwkSedXargsPatterns: diff --git a/codebase_rag/tests/test_single_file_repo_path.py b/codebase_rag/tests/test_single_file_repo_path.py new file mode 100644 index 000000000..71d4a28a7 --- /dev/null +++ b/codebase_rag/tests/test_single_file_repo_path.py @@ -0,0 +1,138 @@ +from pathlib import Path +from unittest.mock import MagicMock + 
+import pytest
+
+from codebase_rag.tests.conftest import (
+    get_node_names,
+    get_relationships,
+    run_updater,
+)
+
+
+@pytest.fixture
+def cpp_single_file(temp_repo: Path) -> Path:
+    test_file = temp_repo / "cmGlobalFastbuildGenerator.cxx"
+    test_file.write_text(
+        encoding="utf-8",
+        data="""
+#include <map>
+#include <set>
+#include <string>
+
+static std::map<std::string, std::string> const compilerIdToFastbuildFamily = {
+    {"GNU", "gcc"},
+    {"Clang", "clang"},
+};
+
+static std::set<std::string> const supportedLanguages = {
+    "C",
+    "CXX",
+};
+
+template <typename T>
+T generateAlias(std::string const& name) { return T(); }
+
+static void helperFunc() {}
+
+class FastbuildTarget {
+public:
+    void GenerateAliases();
+};
+
+void FastbuildTarget::GenerateAliases() {
+    auto alias = generateAlias<std::string>("test");
+}
+
+void freeFunction() {
+    helperFunc();
+}
+""",
+    )
+    return test_file
+
+
+@pytest.fixture
+def ran_single_file_updater(cpp_single_file: Path, mock_ingestor: MagicMock) -> None:
+    from codebase_rag.graph_updater import GraphUpdater
+    from codebase_rag.parser_loader import load_parsers
+
+    parsers, queries = load_parsers()
+    updater = GraphUpdater(
+        ingestor=mock_ingestor,
+        repo_path=cpp_single_file,
+        parsers=parsers,
+        queries=queries,
+    )
+    updater.run()
+
+
+def test_single_file_repo_path_produces_graph(
+    ran_single_file_updater: None,
+    mock_ingestor: MagicMock,
+) -> None:
+    functions = get_node_names(mock_ingestor, "Function")
+    methods = get_node_names(mock_ingestor, "Method")
+    classes = get_node_names(mock_ingestor, "Class")
+
+    assert any("generateAlias" in qn for qn in functions)
+    assert any("helperFunc" in qn for qn in functions)
+    assert any("freeFunction" in qn for qn in functions)
+
+    assert any("GenerateAliases" in qn for qn in methods)
+    assert any("FastbuildTarget" in qn for qn in classes)
+
+    defines_rels = get_relationships(mock_ingestor, "DEFINES")
+    assert len(defines_rels) >= 3
+
+    calls_rels = get_relationships(mock_ingestor, "CALLS")
+    assert len(calls_rels) >= 1
+
+
+def test_single_file_repo_path_static_functions(
+    ran_single_file_updater: None,
+    mock_ingestor: MagicMock,
+) -> None:
+    functions = get_node_names(mock_ingestor, "Function")
+
+    assert any("helperFunc" in qn for qn in functions), (
+        f"Static function helperFunc not found. Functions: {functions}"
+    )
+
+    assert any("generateAlias" in qn for qn in functions), (
+        f"Template function generateAlias not found. Functions: {functions}"
+    )
+
+
+def test_single_file_repo_path_out_of_class_methods(
+    ran_single_file_updater: None,
+    mock_ingestor: MagicMock,
+) -> None:
+    methods = get_node_names(mock_ingestor, "Method")
+    defines_method_rels = get_relationships(mock_ingestor, "DEFINES_METHOD")
+
+    assert any("GenerateAliases" in qn for qn in methods), (
+        f"Out-of-class method GenerateAliases not found. 
Methods: {methods}" + ) + assert len(defines_method_rels) >= 1 + + +def test_directory_repo_path_still_works( + temp_repo: Path, + mock_ingestor: MagicMock, +) -> None: + project = temp_repo / "normal_project" + project.mkdir() + (project / "main.cpp").write_text( + encoding="utf-8", + data=""" +void doStuff() {} +int main() { doStuff(); return 0; } +""", + ) + + run_updater(project, mock_ingestor) + + functions = get_node_names(mock_ingestor, "Function") + assert any("doStuff" in qn for qn in functions) + assert any("main" in qn for qn in functions) diff --git a/codebase_rag/tests/test_slots_and_optimizations.py b/codebase_rag/tests/test_slots_and_optimizations.py new file mode 100644 index 000000000..da8ca621b --- /dev/null +++ b/codebase_rag/tests/test_slots_and_optimizations.py @@ -0,0 +1,127 @@ +from __future__ import annotations + +import pytest + +from codebase_rag.parsers.dependency_parser import ( + CargoTomlParser, + ComposerJsonParser, + CsprojParser, + DependencyParser, + GemfileParser, + GoModParser, + PackageJsonParser, + PyProjectTomlParser, + RequirementsTxtParser, +) +from codebase_rag.parsers.handlers.base import BaseLanguageHandler +from codebase_rag.parsers.handlers.cpp import CppHandler +from codebase_rag.parsers.handlers.java import JavaHandler +from codebase_rag.parsers.handlers.js_ts import JsTsHandler +from codebase_rag.parsers.handlers.lua import LuaHandler +from codebase_rag.parsers.handlers.protocol import LanguageHandler +from codebase_rag.parsers.handlers.python import PythonHandler +from codebase_rag.parsers.handlers.rust import RustHandler +from codebase_rag.parsers.stdlib_extractor import StdlibExtractor +from codebase_rag.parsers.utils import _cached_decode_bytes + + +class TestHandlerSlots: + @pytest.mark.parametrize( + "handler_cls", + [ + BaseLanguageHandler, + PythonHandler, + JavaHandler, + JsTsHandler, + CppHandler, + RustHandler, + LuaHandler, + ], + ) + def test_handler_has_slots(self, handler_cls: type) -> None: + assert hasattr(handler_cls, "__slots__") + + @pytest.mark.parametrize( + "handler_cls", + [ + BaseLanguageHandler, + PythonHandler, + JavaHandler, + JsTsHandler, + CppHandler, + RustHandler, + LuaHandler, + ], + ) + def test_handler_no_instance_dict(self, handler_cls: type) -> None: + instance = handler_cls() + assert not hasattr(instance, "__dict__") + + def test_protocol_has_slots(self) -> None: + assert hasattr(LanguageHandler, "__slots__") + + +class TestDependencyParserSlots: + @pytest.mark.parametrize( + "parser_cls", + [ + DependencyParser, + PyProjectTomlParser, + RequirementsTxtParser, + PackageJsonParser, + CargoTomlParser, + GoModParser, + GemfileParser, + ComposerJsonParser, + CsprojParser, + ], + ) + def test_parser_has_slots(self, parser_cls: type) -> None: + assert hasattr(parser_cls, "__slots__") + + @pytest.mark.parametrize( + "parser_cls", + [ + DependencyParser, + PyProjectTomlParser, + RequirementsTxtParser, + PackageJsonParser, + CargoTomlParser, + GoModParser, + GemfileParser, + ComposerJsonParser, + CsprojParser, + ], + ) + def test_parser_no_instance_dict(self, parser_cls: type) -> None: + instance = parser_cls() + assert not hasattr(instance, "__dict__") + + +class TestStdlibExtractorSlots: + def test_has_slots(self) -> None: + assert hasattr(StdlibExtractor, "__slots__") + assert "function_registry" in StdlibExtractor.__slots__ + assert "repo_path" in StdlibExtractor.__slots__ + assert "project_name" in StdlibExtractor.__slots__ + + def test_no_instance_dict(self) -> None: + extractor = StdlibExtractor() + 
assert not hasattr(extractor, "__dict__") + + +class TestCachedDecodeBytes: + def test_cache_maxsize(self) -> None: + cache_info = _cached_decode_bytes.cache_info() + assert cache_info.maxsize == 50000 + + def test_decode_bytes(self) -> None: + result = _cached_decode_bytes(b"hello world") + assert result == "hello world" + + def test_decode_caches(self) -> None: + _cached_decode_bytes.cache_clear() + _cached_decode_bytes(b"test_cache") + _cached_decode_bytes(b"test_cache") + info = _cached_decode_bytes.cache_info() + assert info.hits >= 1 diff --git a/codebase_rag/tests/test_slots_lazy_logger.py b/codebase_rag/tests/test_slots_lazy_logger.py new file mode 100644 index 000000000..da306ab09 --- /dev/null +++ b/codebase_rag/tests/test_slots_lazy_logger.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from codebase_rag.graph_loader import GraphLoader +from codebase_rag.providers.base import ( + GoogleProvider, + ModelProvider, + OllamaProvider, + OpenAIProvider, +) +from codebase_rag.services.llm import CypherGenerator +from codebase_rag.tools.code_retrieval import CodeRetriever +from codebase_rag.tools.directory_lister import DirectoryLister +from codebase_rag.tools.document_analyzer import DocumentAnalyzer, _NotSupportedClient +from codebase_rag.tools.file_editor import FileEditor +from codebase_rag.tools.file_reader import FileReader +from codebase_rag.tools.file_writer import FileWriter +from codebase_rag.tools.health_checker import HealthChecker +from codebase_rag.tools.shell_command import CommandGroup, ShellCommander + +REPO_ROOT = Path(__file__).resolve().parent.parent + + +SLOTS_CLASSES: list[tuple[type, tuple[str, ...]]] = [ + (_NotSupportedClient, ()), + (DocumentAnalyzer, ("project_root", "client")), + (FileEditor, ("project_root", "dmp", "parsers")), + (CodeRetriever, ("project_root", "ingestor")), + (FileReader, ("project_root",)), + (FileWriter, ("project_root",)), + (DirectoryLister, ("project_root",)), + (CommandGroup, ("commands", "operator")), + (ShellCommander, ("project_root", "timeout")), + (HealthChecker, ("results",)), + (CypherGenerator, ("agent",)), + (ModelProvider, ("config",)), + ( + GoogleProvider, + ( + "api_key", + "provider_type", + "project_id", + "region", + "service_account_file", + "thinking_budget", + ), + ), + (OpenAIProvider, ("api_key", "endpoint")), + (OllamaProvider, ("endpoint", "api_key")), +] + +GRAPH_LOADER_SLOTS = ( + "file_path", + "_data", + "_nodes", + "_relationships", + "_nodes_by_id", + "_nodes_by_label", + "_outgoing_rels", + "_incoming_rels", + "_property_indexes", +) + + +class TestSlotsPresence: + @pytest.mark.parametrize( + ("cls", "expected_slots"), + SLOTS_CLASSES, + ids=[c.__name__ for c, _ in SLOTS_CLASSES], + ) + def test_class_has_slots(self, cls: type, expected_slots: tuple[str, ...]) -> None: + assert hasattr(cls, "__slots__") + assert set(cls.__slots__) == set(expected_slots) + + def test_graph_loader_has_slots(self) -> None: + assert hasattr(GraphLoader, "__slots__") + assert set(GraphLoader.__slots__) == set(GRAPH_LOADER_SLOTS) + + +class TestSlotsBlockDict: + def test_not_supported_client_no_dict(self) -> None: + obj = _NotSupportedClient() + with pytest.raises(NotImplementedError): + obj.__dict__ + + def test_command_group_no_dict(self) -> None: + obj = CommandGroup(commands=["ls"], operator=None) + assert not hasattr(obj, "__dict__") + + def test_directory_lister_no_dict(self, tmp_path: Path) -> None: + obj = 
DirectoryLister(str(tmp_path)) + assert not hasattr(obj, "__dict__") + + def test_file_reader_no_dict(self, tmp_path: Path) -> None: + obj = FileReader(str(tmp_path)) + assert not hasattr(obj, "__dict__") + + def test_file_writer_no_dict(self, tmp_path: Path) -> None: + obj = FileWriter(str(tmp_path)) + assert not hasattr(obj, "__dict__") + + def test_health_checker_no_dict(self) -> None: + obj = HealthChecker() + assert not hasattr(obj, "__dict__") + + def test_shell_commander_no_dict(self, tmp_path: Path) -> None: + obj = ShellCommander(str(tmp_path)) + assert not hasattr(obj, "__dict__") + + def test_code_retriever_no_dict(self, tmp_path: Path) -> None: + mock_ingestor = MagicMock() + obj = CodeRetriever(str(tmp_path), mock_ingestor) + assert not hasattr(obj, "__dict__") + + +class TestSlotsRejectArbitraryAttrs: + def test_not_supported_client_rejects_attr(self) -> None: + obj = _NotSupportedClient() + with pytest.raises((AttributeError, NotImplementedError)): + obj.arbitrary = 42 + + def test_command_group_rejects_attr(self) -> None: + obj = CommandGroup(commands=["ls"], operator=None) + with pytest.raises(AttributeError): + obj.arbitrary = 42 + + def test_directory_lister_rejects_attr(self, tmp_path: Path) -> None: + obj = DirectoryLister(str(tmp_path)) + with pytest.raises(AttributeError): + obj.arbitrary = 42 + + def test_health_checker_rejects_attr(self) -> None: + obj = HealthChecker() + with pytest.raises(AttributeError): + obj.arbitrary = 42 + + def test_shell_commander_rejects_attr(self, tmp_path: Path) -> None: + obj = ShellCommander(str(tmp_path)) + with pytest.raises(AttributeError): + obj.arbitrary = 42 + + +LAZY_LOGGER_FILES: list[str] = [ + "parser_loader.py", + "utils/fqn_resolver.py", + "utils/source_extraction.py", + "tools/document_analyzer.py", + "tools/file_editor.py", +] + + +def _find_eager_debug_calls(source: str) -> list[str]: + results = [] + lines = source.split("\n") + i = 0 + while i < len(lines): + line = lines[i] + stripped = line.strip() + if stripped.startswith("logger.debug("): + block = stripped + j = i + paren_count = block.count("(") - block.count(")") + while paren_count > 0 and j + 1 < len(lines): + j += 1 + block += " " + lines[j].strip() + paren_count += lines[j].count("(") - lines[j].count(")") + if ".format(" in block: + results.append(block[:80]) + i = j + 1 + else: + i += 1 + return results + + +class TestLazyLoggerFormat: + @pytest.mark.parametrize("rel_path", LAZY_LOGGER_FILES) + def test_no_eager_debug_format(self, rel_path: str) -> None: + file_path = REPO_ROOT / rel_path + source = file_path.read_text(encoding="utf-8") + eager_calls = _find_eager_debug_calls(source) + assert len(eager_calls) == 0, ( + f"Found {len(eager_calls)} eager logger.debug(.format()) calls in {rel_path}: {eager_calls}" + ) + + +class TestProviderSlotsInheritance: + def test_google_provider_inherits_config_slot(self) -> None: + assert "config" in ModelProvider.__slots__ + assert "config" not in GoogleProvider.__slots__ + + def test_openai_provider_inherits_config_slot(self) -> None: + assert "config" not in OpenAIProvider.__slots__ + + def test_ollama_provider_inherits_config_slot(self) -> None: + assert "config" not in OllamaProvider.__slots__ + + @patch.dict("os.environ", {"GOOGLE_API_KEY": "test-key"}) + def test_google_provider_instance_has_all_attrs(self) -> None: + provider = GoogleProvider(api_key="test-key") + assert provider.api_key == "test-key" + assert provider.config == {} + + def test_openai_provider_instance_has_all_attrs(self) -> None: + provider = 
OpenAIProvider(api_key="test-key") + assert provider.api_key == "test-key" + assert provider.config == {} + + @patch("codebase_rag.providers.base.settings") + def test_ollama_provider_instance_has_all_attrs( + self, mock_settings: MagicMock + ) -> None: + mock_settings.ollama_endpoint = "http://localhost:11434/v1/" + provider = OllamaProvider() + assert provider.endpoint == "http://localhost:11434/v1/" + assert provider.config == {} diff --git a/codebase_rag/tests/test_source_extraction.py b/codebase_rag/tests/test_source_extraction.py index df7b9099e..9296c91fb 100644 --- a/codebase_rag/tests/test_source_extraction.py +++ b/codebase_rag/tests/test_source_extraction.py @@ -12,7 +12,7 @@ class TestExtractSourceLines: def test_extracts_single_line(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\nline3\n") + file_path.write_bytes(b"line1\nline2\nline3\n") result = extract_source_lines(file_path, 2, 2) @@ -20,7 +20,7 @@ def test_extracts_single_line(self, tmp_path: Path) -> None: def test_extracts_multiple_lines(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\nline3\nline4\n") + file_path.write_bytes(b"line1\nline2\nline3\nline4\n") result = extract_source_lines(file_path, 2, 3) @@ -28,7 +28,7 @@ def test_extracts_multiple_lines(self, tmp_path: Path) -> None: def test_extracts_all_lines(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\nline3\n") + file_path.write_bytes(b"line1\nline2\nline3\n") result = extract_source_lines(file_path, 1, 3) @@ -36,7 +36,7 @@ def test_extracts_all_lines(self, tmp_path: Path) -> None: def test_strips_trailing_whitespace(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data=" code \n more \n") + file_path.write_bytes(b" code \n more \n") result = extract_source_lines(file_path, 1, 2) @@ -51,7 +51,7 @@ def test_returns_none_for_nonexistent_file(self, tmp_path: Path) -> None: def test_returns_none_for_zero_start_line(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\n") + file_path.write_bytes(b"line1\n") result = extract_source_lines(file_path, 0, 1) @@ -59,7 +59,7 @@ def test_returns_none_for_zero_start_line(self, tmp_path: Path) -> None: def test_returns_none_for_negative_start_line(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\n") + file_path.write_bytes(b"line1\n") result = extract_source_lines(file_path, -1, 1) @@ -67,7 +67,7 @@ def test_returns_none_for_negative_start_line(self, tmp_path: Path) -> None: def test_returns_none_for_zero_end_line(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\n") + file_path.write_bytes(b"line1\n") result = extract_source_lines(file_path, 1, 0) @@ -75,7 +75,7 @@ def test_returns_none_for_zero_end_line(self, tmp_path: Path) -> None: def test_returns_none_for_start_greater_than_end(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") result = extract_source_lines(file_path, 2, 1) @@ -83,23 +83,23 @@ def test_returns_none_for_start_greater_than_end(self, tmp_path: Path) -> None: def 
test_returns_none_when_start_exceeds_file_length(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") result = extract_source_lines(file_path, 5, 6) assert result is None - def test_returns_none_when_end_exceeds_file_length(self, tmp_path: Path) -> None: + def test_clamps_when_end_exceeds_file_length(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") result = extract_source_lines(file_path, 1, 10) - assert result is None + assert result == "line1\nline2" def test_handles_empty_file(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="") + file_path.write_bytes(b"") result = extract_source_lines(file_path, 1, 1) @@ -107,17 +107,61 @@ def test_handles_empty_file(self, tmp_path: Path) -> None: def test_preserves_indentation(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="def func():\n return 42\n") + file_path.write_bytes(b"def func():\n return 42\n") result = extract_source_lines(file_path, 1, 2) assert result == "def func():\n return 42" + def test_counts_blank_lines(self, tmp_path: Path) -> None: + file_path = tmp_path / "test.py" + file_path.write_bytes(b"line1\n\nline3\n\nline5\n") + + result = extract_source_lines(file_path, 1, 5) + + assert result == "line1\n\nline3\n\nline5" + + def test_extracts_across_blank_lines(self, tmp_path: Path) -> None: + file_path = tmp_path / "test.py" + file_path.write_bytes( + b"def func1():\n pass\n\ndef func2():\n return 42\n" + ) + + result = extract_source_lines(file_path, 4, 5) + + assert result == "def func2():\n return 42" + + def test_preserves_internal_blank_lines(self, tmp_path: Path) -> None: + file_path = tmp_path / "test.py" + file_path.write_bytes( + b"def func():\n x = 1\n\n y = 2\n\n return x + y\n" + ) + + result = extract_source_lines(file_path, 1, 6) + + assert result == "def func():\n x = 1\n\n y = 2\n\n return x + y" + + def test_line_count_matches_with_many_blank_lines(self, tmp_path: Path) -> None: + file_path = tmp_path / "test.py" + file_path.write_bytes(b"a\n\n\n\nb\n\n\n\nc\n") + + result = extract_source_lines(file_path, 5, 5) + + assert result == "b" + + def test_clamps_end_line_returns_partial_content(self, tmp_path: Path) -> None: + file_path = tmp_path / "test.py" + file_path.write_bytes(b"def func():\n pass\n\ndef other():\n return 1\n") + + result = extract_source_lines(file_path, 4, 100) + + assert result == "def other():\n return 1" + class TestExtractSourceWithFallback: def test_uses_line_extraction_when_no_ast_extractor(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") result = extract_source_with_fallback(file_path, 1, 2) @@ -125,7 +169,7 @@ def test_uses_line_extraction_when_no_ast_extractor(self, tmp_path: Path) -> Non def test_uses_ast_extractor_when_provided(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") def mock_ast_extractor(name: str, path: Path) -> str: return f"AST result for {name}" @@ -140,7 +184,7 @@ def test_falls_back_to_lines_when_ast_extractor_returns_none( self, tmp_path: Path ) -> None: file_path = tmp_path 
/ "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") def mock_ast_extractor(name: str, path: Path) -> None: return None @@ -155,7 +199,7 @@ def test_falls_back_to_lines_when_ast_extractor_raises( self, tmp_path: Path ) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") def mock_ast_extractor(name: str, path: Path) -> str: raise RuntimeError("AST extraction failed") @@ -168,7 +212,7 @@ def mock_ast_extractor(name: str, path: Path) -> str: def test_skips_ast_when_qualified_name_is_none(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") ast_called = False def mock_ast_extractor(name: str, path: Path) -> str: @@ -185,7 +229,7 @@ def mock_ast_extractor(name: str, path: Path) -> str: def test_skips_ast_when_extractor_is_none(self, tmp_path: Path) -> None: file_path = tmp_path / "test.py" - file_path.write_text(encoding="utf-8", data="line1\nline2\n") + file_path.write_bytes(b"line1\nline2\n") result = extract_source_with_fallback( file_path, 1, 2, qualified_name="my.func", ast_extractor=None diff --git a/codebase_rag/tests/test_stats_command.py b/codebase_rag/tests/test_stats_command.py new file mode 100644 index 000000000..6e86f251b --- /dev/null +++ b/codebase_rag/tests/test_stats_command.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest +from typer.testing import CliRunner + +from codebase_rag.cli import app +from codebase_rag.types_defs import ResultRow + + +@pytest.fixture +def runner() -> CliRunner: + return CliRunner() + + +@pytest.fixture +def mock_node_results() -> list[ResultRow]: + return [ + {"labels": ["Function"], "count": 100}, + {"labels": ["Class"], "count": 50}, + {"labels": ["Module"], "count": 30}, + ] + + +@pytest.fixture +def mock_rel_results() -> list[ResultRow]: + return [ + {"type": "CALLS", "count": 200}, + {"type": "DEFINES", "count": 80}, + ] + + +def _make_mock_ingestor(*fetch_side_effects: list[ResultRow]) -> MagicMock: + mock = MagicMock() + mock.fetch_all.side_effect = list(fetch_side_effects) + mock.__enter__ = MagicMock(return_value=mock) + mock.__exit__ = MagicMock(return_value=False) + return mock + + +class TestStatsCommand: + def test_stats_displays_node_table( + self, + runner: CliRunner, + mock_node_results: list[ResultRow], + mock_rel_results: list[ResultRow], + ) -> None: + mock_ingestor = _make_mock_ingestor(mock_node_results, mock_rel_results) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["stats"]) + + assert result.exit_code == 0 + assert "Function" in result.output + assert "Class" in result.output + assert "Module" in result.output + + def test_stats_displays_relationship_table( + self, + runner: CliRunner, + mock_node_results: list[ResultRow], + mock_rel_results: list[ResultRow], + ) -> None: + mock_ingestor = _make_mock_ingestor(mock_node_results, mock_rel_results) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["stats"]) + + assert result.exit_code == 0 + assert "CALLS" in result.output + assert "DEFINES" in result.output + + def test_stats_displays_totals( + self, + runner: CliRunner, + mock_node_results: list[ResultRow], + mock_rel_results: list[ResultRow], + ) -> None: + 
mock_ingestor = _make_mock_ingestor(mock_node_results, mock_rel_results) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["stats"]) + + assert result.exit_code == 0 + assert "180" in result.output + assert "280" in result.output + + def test_stats_handles_empty_graph( + self, + runner: CliRunner, + ) -> None: + mock_ingestor = _make_mock_ingestor([], []) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["stats"]) + + assert result.exit_code == 0 + assert "0" in result.output + + def test_stats_handles_connection_error( + self, + runner: CliRunner, + ) -> None: + with patch( + "codebase_rag.cli.connect_memgraph", + side_effect=ConnectionError("Cannot connect"), + ): + result = runner.invoke(app, ["stats"]) + + assert result.exit_code == 1 + assert "Failed" in result.output + + def test_stats_handles_multi_label_nodes( + self, + runner: CliRunner, + mock_rel_results: list[ResultRow], + ) -> None: + node_results: list[ResultRow] = [ + {"labels": ["Function", "Exported"], "count": 10}, + ] + mock_ingestor = _make_mock_ingestor(node_results, mock_rel_results) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["stats"]) + + assert result.exit_code == 0 + assert "Function:Exported" in result.output + + def test_stats_handles_empty_labels( + self, + runner: CliRunner, + mock_rel_results: list[ResultRow], + ) -> None: + node_results: list[ResultRow] = [ + {"labels": [], "count": 5}, + ] + mock_ingestor = _make_mock_ingestor(node_results, mock_rel_results) + with patch("codebase_rag.cli.connect_memgraph", return_value=mock_ingestor): + result = runner.invoke(app, ["stats"]) + + assert result.exit_code == 0 + assert "Unknown" in result.output diff --git a/codebase_rag/tests/test_stdlib_extractor.py b/codebase_rag/tests/test_stdlib_extractor.py index bd09b0244..723650741 100644 --- a/codebase_rag/tests/test_stdlib_extractor.py +++ b/codebase_rag/tests/test_stdlib_extractor.py @@ -306,7 +306,7 @@ def test_js_stdlib_lowercase_entity_without_node( "fs.readFile", cs.SupportedLanguage.JS ) - assert result == "fs.readFile" + assert result == "fs" def test_ts_uses_js_extraction_uppercase(self, extractor: StdlibExtractor) -> None: with patch.object(se, "_is_tool_available", return_value=False): @@ -314,11 +314,11 @@ def test_ts_uses_js_extraction_uppercase(self, extractor: StdlibExtractor) -> No assert result == "path" - def test_ts_lowercase_returns_unchanged(self, extractor: StdlibExtractor) -> None: + def test_ts_lowercase_strips_entity(self, extractor: StdlibExtractor) -> None: with patch.object(se, "_is_tool_available", return_value=False): result = extractor.extract_module_path("path.join", cs.SupportedLanguage.TS) - assert result == "path.join" + assert result == "path" class TestEdgeCases: @@ -704,7 +704,7 @@ def test_js_extractor_fallback_on_entity_not_found( "fs.nonexistent", cs.SupportedLanguage.JS ) - assert result == "fs.nonexistent" + assert result == "fs" def test_js_extractor_fallback_on_json_decode_error( self, extractor: StdlibExtractor @@ -719,7 +719,7 @@ def test_js_extractor_fallback_on_json_decode_error( ): result = extractor.extract_module_path("path.join", cs.SupportedLanguage.JS) - assert result == "path.join" + assert result == "path" def test_js_extractor_fallback_on_timeout(self, extractor: StdlibExtractor) -> None: import subprocess @@ -732,4 +732,4 @@ def test_js_extractor_fallback_on_timeout(self, 
extractor: StdlibExtractor) -> N "http.createServer", cs.SupportedLanguage.JS ) - assert result == "http.createServer" + assert result == "http" diff --git a/codebase_rag/tests/test_structure_processor.py b/codebase_rag/tests/test_structure_processor.py index 51c23fe60..50c74ea2c 100644 --- a/codebase_rag/tests/test_structure_processor.py +++ b/codebase_rag/tests/test_structure_processor.py @@ -511,3 +511,22 @@ def test_multiple_package_indicators( ] qualified_names = {c[0][1]["qualified_name"] for c in package_calls} assert qualified_names == {"multi_lang.pypkg", "multi_lang.rustpkg"} + + +class TestStructureProcessorSlots: + def test_has_slots(self) -> None: + assert hasattr(StructureProcessor, "__slots__") + + def test_no_instance_dict(self, processor: StructureProcessor) -> None: + assert not hasattr(processor, "__dict__") + + def test_rejects_arbitrary_attribute(self, processor: StructureProcessor) -> None: + with pytest.raises(AttributeError): + processor.nonexistent_attr = 42 + + def test_slot_attributes_accessible(self, processor: StructureProcessor) -> None: + assert hasattr(processor, "ingestor") + assert hasattr(processor, "repo_path") + assert hasattr(processor, "project_name") + assert hasattr(processor, "queries") + assert hasattr(processor, "structural_elements") diff --git a/codebase_rag/tests/test_token_utils.py b/codebase_rag/tests/test_token_utils.py new file mode 100644 index 000000000..bbd116c13 --- /dev/null +++ b/codebase_rag/tests/test_token_utils.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from codebase_rag.types_defs import ResultRow +from codebase_rag.utils.token_utils import count_tokens, truncate_results_by_tokens + + +class TestCountTokens: + def test_empty_string(self) -> None: + assert count_tokens("") == 0 + + def test_simple_string(self) -> None: + tokens = count_tokens("hello world") + assert tokens > 0 + + def test_longer_string_has_more_tokens(self) -> None: + short = count_tokens("hello") + long = count_tokens("hello world this is a longer string with more tokens") + assert long > short + + +class TestTruncateResultsByTokens: + def test_empty_results(self) -> None: + results, tokens, truncated = truncate_results_by_tokens([], max_tokens=1000) + assert results == [] + assert tokens == 0 + assert truncated is False + + def test_results_within_limit(self) -> None: + rows: list[ResultRow] = [ + {"name": "foo", "count": 1}, + {"name": "bar", "count": 2}, + ] + results, tokens, truncated = truncate_results_by_tokens(rows, max_tokens=10000) + assert len(results) == 2 + assert tokens > 0 + assert truncated is False + + def test_results_exceed_limit(self) -> None: + rows: list[ResultRow] = [ + {"name": f"function_{i}", "path": f"src/module_{i}/file_{i}.py"} + for i in range(100) + ] + results, tokens, truncated = truncate_results_by_tokens(rows, max_tokens=200) + assert len(results) < 100 + assert len(results) > 0 + assert tokens <= 200 + assert truncated is True + + def test_single_large_row_still_included(self) -> None: + rows: list[ResultRow] = [ + {"content": "x" * 5000}, + ] + results, tokens, truncated = truncate_results_by_tokens(rows, max_tokens=10) + assert len(results) == 1 + assert truncated is False + + def test_preserves_row_order(self) -> None: + rows: list[ResultRow] = [ + {"name": "first"}, + {"name": "second"}, + {"name": "third"}, + ] + results, _, _ = truncate_results_by_tokens(rows, max_tokens=10000) + assert [r["name"] for r in results] == ["first", "second", "third"] + + def test_token_count_accuracy(self) -> None: + rows: 
list[ResultRow] = [ + {"name": "hello world"}, + ] + results, tokens, _ = truncate_results_by_tokens(rows, max_tokens=10000) + assert tokens == count_tokens('{"name": "hello world"}') diff --git a/codebase_rag/tests/test_type_inference_iterative.py b/codebase_rag/tests/test_type_inference_iterative.py index 76d0febeb..62ac84dbd 100644 --- a/codebase_rag/tests/test_type_inference_iterative.py +++ b/codebase_rag/tests/test_type_inference_iterative.py @@ -3,7 +3,7 @@ from collections import defaultdict from pathlib import Path from typing import Any -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest @@ -88,15 +88,16 @@ def test_analyze_self_assignments_handles_deep_tree_without_recursion_error() -> engine = _make_engine() py_engine = engine.python_type_inference - py_engine._infer_type_from_expression = MagicMock(return_value="MockType") # type: ignore[method-assign] + mock_infer = MagicMock(return_value="MockType") root = _build_deep_assignment_chain(depth=1500) local_types: dict[str, Any] = {} - py_engine._analyze_self_assignments(root, local_types, "proj.module") # ty: ignore[invalid-argument-type] # (H) NodeStub not Node + with patch.object(type(py_engine), "_infer_type_from_expression", mock_infer): + py_engine._analyze_self_assignments(root, local_types, "proj.module") # ty: ignore[invalid-argument-type] # (H) NodeStub not Node assert local_types, "Expected at least one inferred instance variable" - assert py_engine._infer_type_from_expression.call_count == 1500 # type: ignore[attr-defined] + assert mock_infer.call_count == 1500 def test_find_return_statements_handles_deep_tree_without_recursion_error() -> None: @@ -162,86 +163,91 @@ def test_dispatches_to_python_engine( self, engine: TypeInferenceEngine, mock_node: MagicMock ) -> None: expected = {"var1": "str"} - engine.python_type_inference.build_local_variable_type_map = MagicMock( - return_value=expected - ) + mock_method = MagicMock(return_value=expected) - result = engine.build_local_variable_type_map( - mock_node, "proj.module", cs.SupportedLanguage.PYTHON - ) + with patch.object( + PythonTypeInferenceEngine, + "build_local_variable_type_map", + mock_method, + ): + result = engine.build_local_variable_type_map( + mock_node, "proj.module", cs.SupportedLanguage.PYTHON + ) assert result == expected - engine.python_type_inference.build_local_variable_type_map.assert_called_once_with( - mock_node, "proj.module" - ) + mock_method.assert_called_once_with(mock_node, "proj.module") def test_dispatches_to_js_engine( self, engine: TypeInferenceEngine, mock_node: MagicMock ) -> None: expected = {"jsVar": "number"} - engine.js_type_inference.build_local_variable_type_map = MagicMock( - return_value=expected - ) + mock_method = MagicMock(return_value=expected) - result = engine.build_local_variable_type_map( - mock_node, "proj.module", cs.SupportedLanguage.JS - ) + with patch.object( + JsTypeInferenceEngine, + "build_local_variable_type_map", + mock_method, + ): + result = engine.build_local_variable_type_map( + mock_node, "proj.module", cs.SupportedLanguage.JS + ) assert result == expected - engine.js_type_inference.build_local_variable_type_map.assert_called_once_with( - mock_node, "proj.module" - ) + mock_method.assert_called_once_with(mock_node, "proj.module") def test_dispatches_to_ts_engine( self, engine: TypeInferenceEngine, mock_node: MagicMock ) -> None: expected = {"tsVar": "string"} - engine.js_type_inference.build_local_variable_type_map = MagicMock( - return_value=expected - ) + 
mock_method = MagicMock(return_value=expected) - result = engine.build_local_variable_type_map( - mock_node, "proj.module", cs.SupportedLanguage.TS - ) + with patch.object( + JsTypeInferenceEngine, + "build_local_variable_type_map", + mock_method, + ): + result = engine.build_local_variable_type_map( + mock_node, "proj.module", cs.SupportedLanguage.TS + ) assert result == expected - engine.js_type_inference.build_local_variable_type_map.assert_called_once_with( - mock_node, "proj.module" - ) + mock_method.assert_called_once_with(mock_node, "proj.module") def test_dispatches_to_java_engine( self, engine: TypeInferenceEngine, mock_node: MagicMock ) -> None: expected = {"javaVar": "String"} - engine.java_type_inference.build_variable_type_map = MagicMock( - return_value=expected - ) + mock_method = MagicMock(return_value=expected) - result = engine.build_local_variable_type_map( - mock_node, "proj.module", cs.SupportedLanguage.JAVA - ) + with patch.object( + JavaTypeInferenceEngine, + "build_variable_type_map", + mock_method, + ): + result = engine.build_local_variable_type_map( + mock_node, "proj.module", cs.SupportedLanguage.JAVA + ) assert result == expected - engine.java_type_inference.build_variable_type_map.assert_called_once_with( - mock_node, "proj.module" - ) + mock_method.assert_called_once_with(mock_node, "proj.module") def test_dispatches_to_lua_engine( self, engine: TypeInferenceEngine, mock_node: MagicMock ) -> None: expected = {"luaVar": "table"} - engine.lua_type_inference.build_local_variable_type_map = MagicMock( - return_value=expected - ) + mock_method = MagicMock(return_value=expected) - result = engine.build_local_variable_type_map( - mock_node, "proj.module", cs.SupportedLanguage.LUA - ) + with patch.object( + LuaTypeInferenceEngine, + "build_local_variable_type_map", + mock_method, + ): + result = engine.build_local_variable_type_map( + mock_node, "proj.module", cs.SupportedLanguage.LUA + ) assert result == expected - engine.lua_type_inference.build_local_variable_type_map.assert_called_once_with( - mock_node, "proj.module" - ) + mock_method.assert_called_once_with(mock_node, "proj.module") @pytest.mark.parametrize( "language", @@ -320,13 +326,16 @@ def test_delegates_to_java_engine(self) -> None: engine = _make_engine() mock_node = MagicMock() expected = {"javaVar": "String", "count": "int"} - engine.java_type_inference.build_variable_type_map = MagicMock( - return_value=expected - ) + mock_method = MagicMock(return_value=expected) - result = engine._build_java_variable_type_map(mock_node, "com.example.Module") + with patch.object( + JavaTypeInferenceEngine, + "build_variable_type_map", + mock_method, + ): + result = engine._build_java_variable_type_map( + mock_node, "com.example.Module" + ) assert result == expected - engine.java_type_inference.build_variable_type_map.assert_called_once_with( - mock_node, "com.example.Module" - ) + mock_method.assert_called_once_with(mock_node, "com.example.Module") diff --git a/codebase_rag/tests/test_vector_store_batch.py b/codebase_rag/tests/test_vector_store_batch.py new file mode 100644 index 000000000..597ebd2d2 --- /dev/null +++ b/codebase_rag/tests/test_vector_store_batch.py @@ -0,0 +1,225 @@ +from unittest.mock import MagicMock, patch + +import pytest + +from codebase_rag.utils.dependencies import has_qdrant_client + +pytestmark = pytest.mark.skipif( + not has_qdrant_client(), reason="qdrant-client not installed" +) + +_PATCH_CLIENT = "codebase_rag.vector_store.get_qdrant_client" +_PATCH_SLEEP = 
"codebase_rag.vector_store.time.sleep" + + +class TestUpsertWithRetry: + def test_succeeds_on_first_attempt(self) -> None: + from codebase_rag.vector_store import _upsert_with_retry + + mock_client = MagicMock() + mock_point = MagicMock() + + with patch(_PATCH_CLIENT, return_value=mock_client): + _upsert_with_retry([mock_point]) + + mock_client.upsert.assert_called_once() + + def test_retries_on_failure_then_succeeds(self) -> None: + from codebase_rag.vector_store import _upsert_with_retry + + mock_client = MagicMock() + mock_client.upsert.side_effect = [ + ConnectionError("timeout"), + None, + ] + + with ( + patch(_PATCH_CLIENT, return_value=mock_client), + patch(_PATCH_SLEEP) as mock_sleep, + ): + _upsert_with_retry([MagicMock()]) + + assert mock_client.upsert.call_count == 2 + mock_sleep.assert_called_once() + + def test_raises_after_exhausting_retries(self) -> None: + from codebase_rag.vector_store import _upsert_with_retry + + mock_client = MagicMock() + mock_client.upsert.side_effect = ConnectionError("timeout") + + with ( + patch(_PATCH_CLIENT, return_value=mock_client), + patch(_PATCH_SLEEP), + pytest.raises(ConnectionError, match="timeout"), + ): + _upsert_with_retry([MagicMock()]) + + def test_exponential_backoff_delays(self) -> None: + from codebase_rag.vector_store import _upsert_with_retry + + mock_client = MagicMock() + mock_client.upsert.side_effect = [ + ConnectionError("fail"), + ConnectionError("fail"), + None, + ] + + with ( + patch(_PATCH_CLIENT, return_value=mock_client), + patch(_PATCH_SLEEP) as mock_sleep, + ): + _upsert_with_retry([MagicMock()]) + + delays = [c.args[0] for c in mock_sleep.call_args_list] + assert delays[1] > delays[0] + + +class TestStoreEmbeddingBatch: + def test_returns_count_on_success(self) -> None: + from codebase_rag.vector_store import store_embedding_batch + + mock_client = MagicMock() + points = [ + (1, [0.1] * 768, "mod.func1"), + (2, [0.2] * 768, "mod.func2"), + ] + + with patch(_PATCH_CLIENT, return_value=mock_client): + result = store_embedding_batch(points) + + assert result == 2 + + def test_returns_zero_on_empty(self) -> None: + from codebase_rag.vector_store import store_embedding_batch + + result = store_embedding_batch([]) + assert result == 0 + + def test_returns_zero_on_failure(self) -> None: + from codebase_rag.vector_store import store_embedding_batch + + mock_client = MagicMock() + mock_client.upsert.side_effect = Exception("fail") + + with ( + patch(_PATCH_CLIENT, return_value=mock_client), + patch(_PATCH_SLEEP), + ): + result = store_embedding_batch([(1, [0.1] * 768, "mod.func")]) + + assert result == 0 + + def test_builds_correct_point_structs(self) -> None: + from codebase_rag.vector_store import store_embedding_batch + + mock_client = MagicMock() + embedding = [0.5] * 768 + points = [(42, embedding, "pkg.module.fn")] + + with patch(_PATCH_CLIENT, return_value=mock_client): + store_embedding_batch(points) + + call_kwargs = mock_client.upsert.call_args[1] + stored_points = call_kwargs["points"] + assert len(stored_points) == 1 + assert stored_points[0].id == 42 + assert stored_points[0].vector == embedding + assert stored_points[0].payload["node_id"] == 42 + assert stored_points[0].payload["qualified_name"] == "pkg.module.fn" + + +class TestDeleteProjectEmbeddings: + def test_deletes_given_ids(self) -> None: + from codebase_rag.vector_store import delete_project_embeddings + + mock_client = MagicMock() + node_ids = [1, 2, 3] + + with patch(_PATCH_CLIENT, return_value=mock_client): + delete_project_embeddings("myproject", 
node_ids) + + mock_client.delete.assert_called_once() + call_kwargs = mock_client.delete.call_args[1] + assert call_kwargs["points_selector"] == [1, 2, 3] + + def test_noop_on_empty_ids(self) -> None: + from codebase_rag.vector_store import delete_project_embeddings + + mock_client = MagicMock() + + with patch(_PATCH_CLIENT, return_value=mock_client): + delete_project_embeddings("myproject", []) + + mock_client.delete.assert_not_called() + + def test_handles_exception_gracefully(self) -> None: + from codebase_rag.vector_store import delete_project_embeddings + + mock_client = MagicMock() + mock_client.delete.side_effect = Exception("connection lost") + + with patch(_PATCH_CLIENT, return_value=mock_client): + delete_project_embeddings("myproject", [1, 2]) + + +class TestVerifyStoredIds: + def test_returns_found_ids(self) -> None: + from codebase_rag.vector_store import verify_stored_ids + + mock_client = MagicMock() + mock_point_1 = MagicMock() + mock_point_1.id = 1 + mock_point_2 = MagicMock() + mock_point_2.id = 3 + mock_client.retrieve.return_value = [mock_point_1, mock_point_2] + + with patch(_PATCH_CLIENT, return_value=mock_client): + result = verify_stored_ids({1, 2, 3}) + + assert result == {1, 3} + + def test_returns_empty_for_empty_input(self) -> None: + from codebase_rag.vector_store import verify_stored_ids + + result = verify_stored_ids(set()) + assert result == set() + + def test_raises_on_exception(self) -> None: + from codebase_rag.vector_store import verify_stored_ids + + mock_client = MagicMock() + mock_client.retrieve.side_effect = Exception("fail") + + with ( + patch(_PATCH_CLIENT, return_value=mock_client), + pytest.raises(Exception, match="fail"), + ): + verify_stored_ids({1, 2}) + + def test_batches_large_id_sets(self) -> None: + from codebase_rag.vector_store import _RETRIEVE_BATCH_SIZE, verify_stored_ids + + mock_client = MagicMock() + mock_client.retrieve.return_value = [] + + large_id_set = set(range(_RETRIEVE_BATCH_SIZE + 100)) + + with patch(_PATCH_CLIENT, return_value=mock_client): + verify_stored_ids(large_id_set) + + assert mock_client.retrieve.call_count == 2 + + def test_retrieve_called_with_correct_params(self) -> None: + from codebase_rag.vector_store import verify_stored_ids + + mock_client = MagicMock() + mock_client.retrieve.return_value = [] + + with patch(_PATCH_CLIENT, return_value=mock_client): + verify_stored_ids({10, 20}) + + call_kwargs = mock_client.retrieve.call_args[1] + assert call_kwargs["with_payload"] is False + assert call_kwargs["with_vectors"] is False + assert set(call_kwargs["ids"]) == {10, 20} diff --git a/codebase_rag/tool_errors.py b/codebase_rag/tool_errors.py index 25540a976..81ead3459 100644 --- a/codebase_rag/tool_errors.py +++ b/codebase_rag/tool_errors.py @@ -34,6 +34,10 @@ DIRECTORY_INVALID = "Error: '{path}' is not a valid directory." DIRECTORY_EMPTY = "Error: The directory '{path}' is empty." DIRECTORY_LIST_FAILED = "Error: Could not list contents of '{path}'." +DIRECTORY_PATH_OUTSIDE_ROOT = ( + "Error: '{path}' is outside the project root ({root}). " + "Use a relative path from the project root, or the full absolute path within it." 
+) # (H) Shell command errors COMMAND_NOT_ALLOWED = "Command '{cmd}' is not in the allowlist.{suggestion} Available commands: {available}" @@ -69,3 +73,4 @@ # (H) CLI validation errors INVALID_POSITIVE_INT = "{value!r} is not a valid positive integer" +INVALID_NON_NEGATIVE_FLOAT = "Value must be non-negative, got {value}" diff --git a/codebase_rag/tools/code_retrieval.py b/codebase_rag/tools/code_retrieval.py index 2e6331dcd..bd04cce0a 100644 --- a/codebase_rag/tools/code_retrieval.py +++ b/codebase_rag/tools/code_retrieval.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio from pathlib import Path from loguru import logger @@ -15,6 +16,8 @@ class CodeRetriever: + __slots__ = ("project_root", "ingestor") + def __init__(self, project_root: str, ingestor: QueryProtocol): self.project_root = Path(project_root).resolve() self.ingestor = ingestor @@ -25,7 +28,9 @@ async def find_code_snippet(self, qualified_name: str) -> CodeSnippet: params = {"qn": qualified_name} try: - results = self.ingestor.fetch_all(CYPHER_FIND_BY_QUALIFIED_NAME, params) + results = await asyncio.to_thread( + self.ingestor.fetch_all, CYPHER_FIND_BY_QUALIFIED_NAME, params + ) if not results: return CodeSnippet( diff --git a/codebase_rag/tools/codebase_query.py b/codebase_rag/tools/codebase_query.py index 690a979bb..cf4e73e51 100644 --- a/codebase_rag/tools/codebase_query.py +++ b/codebase_rag/tools/codebase_query.py @@ -1,5 +1,7 @@ from __future__ import annotations +import asyncio + from loguru import logger from pydantic_ai import Tool from rich.console import Console @@ -8,16 +10,19 @@ from .. import exceptions as ex from .. import logs as ls +from ..config import settings from ..constants import ( QUERY_NOT_AVAILABLE, QUERY_RESULTS_PANEL_TITLE, QUERY_SUMMARY_DB_ERROR, QUERY_SUMMARY_SUCCESS, QUERY_SUMMARY_TRANSLATION_FAILED, + QUERY_SUMMARY_TRUNCATED, ) from ..schemas import QueryGraphData from ..services import QueryProtocol from ..services.llm import CypherGenerator +from ..utils.token_utils import truncate_results_by_tokens from . 
import tool_descriptions as td @@ -27,7 +32,7 @@ def create_query_tool( console: Console | None = None, ) -> Tool: if console is None: - console = Console(width=None, force_terminal=True) + console = Console(width=None, stderr=True, force_terminal=True) async def query_codebase_knowledge_graph( natural_language_query: str, @@ -37,7 +42,17 @@ async def query_codebase_knowledge_graph( try: cypher_query = await cypher_gen.generate(natural_language_query) - results = ingestor.fetch_all(cypher_query) + results = await asyncio.to_thread(ingestor.fetch_all, cypher_query) + + total_count = len(results) + if total_count > settings.QUERY_RESULT_ROW_CAP: + results = results[: settings.QUERY_RESULT_ROW_CAP] + + results, tokens_used, was_truncated = truncate_results_by_tokens( + results, + max_tokens=settings.QUERY_RESULT_MAX_TOKENS, + original_total=total_count, + ) if results: table = Table( @@ -69,7 +84,15 @@ async def query_codebase_knowledge_graph( ) ) - summary = QUERY_SUMMARY_SUCCESS.format(count=len(results)) + if was_truncated or total_count > len(results): + summary = QUERY_SUMMARY_TRUNCATED.format( + kept=len(results), + total=total_count, + tokens=tokens_used, + max_tokens=settings.QUERY_RESULT_MAX_TOKENS, + ) + else: + summary = QUERY_SUMMARY_SUCCESS.format(count=len(results)) return QueryGraphData( query_used=cypher_query, results=results, summary=summary ) diff --git a/codebase_rag/tools/directory_lister.py b/codebase_rag/tools/directory_lister.py index 01136a193..92afcb920 100644 --- a/codebase_rag/tools/directory_lister.py +++ b/codebase_rag/tools/directory_lister.py @@ -13,11 +13,19 @@ class DirectoryLister: + __slots__ = ("project_root",) + def __init__(self, project_root: str): self.project_root = Path(project_root).resolve() def list_directory_contents(self, directory_path: str) -> str: - target_path = self._get_safe_path(directory_path) + try: + target_path = self._get_safe_path(directory_path) + except PermissionError: + return te.DIRECTORY_PATH_OUTSIDE_ROOT.format( + path=directory_path, root=self.project_root + ) + logger.info(ls.DIR_LISTING.format(path=target_path)) try: diff --git a/codebase_rag/tools/document_analyzer.py b/codebase_rag/tools/document_analyzer.py index 2a5475954..1c368aeed 100644 --- a/codebase_rag/tools/document_analyzer.py +++ b/codebase_rag/tools/document_analyzer.py @@ -21,11 +21,15 @@ class _NotSupportedClient: + __slots__ = () + def __getattr__(self, name: str) -> NoReturn: raise NotImplementedError(ex.DOC_UNSUPPORTED_PROVIDER) class DocumentAnalyzer: + __slots__ = ("project_root", "client") + def __init__(self, project_root: str) -> None: self.project_root = Path(project_root).resolve() @@ -35,6 +39,8 @@ def __init__(self, project_root: str) -> None: if orchestrator_provider == cs.Provider.GOOGLE: if orchestrator_config.provider_type == cs.GoogleProviderType.VERTEX: self.client = genai.Client( + vertexai=True, + credentials=orchestrator_config.service_account_file, project=orchestrator_config.project_id, location=orchestrator_config.region, ) @@ -150,9 +156,7 @@ def analyze_document(file_path: str, question: str) -> str: try: result = analyzer.analyze(file_path, question) preview = result[:100] if result else "None" - logger.debug( - ls.DOC_RESULT.format(type=type(result).__name__, preview=preview) - ) + logger.debug(ls.DOC_RESULT, type=type(result).__name__, preview=preview) return result except Exception as e: logger.exception(ls.DOC_EXCEPTION.format(error=e)) diff --git a/codebase_rag/tools/file_editor.py b/codebase_rag/tools/file_editor.py index 
650da823e..bc79ce8e0 100644 --- a/codebase_rag/tools/file_editor.py +++ b/codebase_rag/tools/file_editor.py @@ -20,6 +20,8 @@ class FileEditor: + __slots__ = ("project_root", "dmp", "parsers") + def __init__(self, project_root: str = ".") -> None: self.project_root = Path(project_root).resolve() self.dmp = diff_match_patch.diff_match_patch() @@ -218,7 +220,7 @@ def replace_code_block( if target_block not in original_content: logger.error(ls.EDITOR_BLOCK_NOT_FOUND.format(path=file_path)) - logger.debug(ls.EDITOR_LOOKING_FOR.format(block=repr(target_block))) + logger.debug(ls.EDITOR_LOOKING_FOR, block=repr(target_block)) return False modified_content = original_content.replace( diff --git a/codebase_rag/tools/file_reader.py b/codebase_rag/tools/file_reader.py index 1b5f8618b..ae471ee93 100644 --- a/codebase_rag/tools/file_reader.py +++ b/codebase_rag/tools/file_reader.py @@ -14,6 +14,8 @@ class FileReader: + __slots__ = ("project_root",) + def __init__(self, project_root: str = "."): self.project_root = Path(project_root).resolve() logger.info(ls.FILE_READER_INIT.format(root=self.project_root)) diff --git a/codebase_rag/tools/file_writer.py b/codebase_rag/tools/file_writer.py index 4f3110b3b..ca709778a 100644 --- a/codebase_rag/tools/file_writer.py +++ b/codebase_rag/tools/file_writer.py @@ -14,6 +14,8 @@ class FileWriter: + __slots__ = ("project_root",) + def __init__(self, project_root: str = "."): self.project_root = Path(project_root).resolve() logger.info(ls.FILE_WRITER_INIT.format(root=self.project_root)) diff --git a/codebase_rag/tools/health_checker.py b/codebase_rag/tools/health_checker.py index 2b94f2c6f..36640b5e1 100644 --- a/codebase_rag/tools/health_checker.py +++ b/codebase_rag/tools/health_checker.py @@ -12,6 +12,8 @@ class HealthChecker: + __slots__ = ("results",) + def __init__(self): self.results: list[HealthCheckResult] = [] diff --git a/codebase_rag/tools/semantic_search.py b/codebase_rag/tools/semantic_search.py index e7aa9c5b2..d647ce20e 100644 --- a/codebase_rag/tools/semantic_search.py +++ b/codebase_rag/tools/semantic_search.py @@ -139,7 +139,11 @@ async def semantic_search_functions(query: str, top_k: int = 5) -> str: return response - return Tool(semantic_search_functions, name=td.AgenticToolName.SEMANTIC_SEARCH) + return Tool( + semantic_search_functions, + name=td.AgenticToolName.SEMANTIC_SEARCH, + description=td.SEMANTIC_SEARCH, + ) def create_get_function_source_tool() -> Tool: @@ -153,4 +157,8 @@ async def get_function_source_by_id(node_id: int) -> str: return cs.MSG_SEMANTIC_SOURCE_FORMAT.format(id=node_id, code=source_code) - return Tool(get_function_source_by_id, name=td.AgenticToolName.GET_FUNCTION_SOURCE) + return Tool( + get_function_source_by_id, + name=td.AgenticToolName.GET_FUNCTION_SOURCE, + description=td.GET_FUNCTION_SOURCE, + ) diff --git a/codebase_rag/tools/shell_command.py b/codebase_rag/tools/shell_command.py index 2a4d3aff0..02f682546 100644 --- a/codebase_rag/tools/shell_command.py +++ b/codebase_rag/tools/shell_command.py @@ -58,6 +58,8 @@ def _has_subshell(command: str) -> str | None: class CommandGroup: + __slots__ = ("commands", "operator") + def __init__(self, commands: list[str], operator: str | None = None): self.commands = commands self.operator = operator @@ -152,12 +154,12 @@ def _is_dangerous_rm_path(cmd_parts: list[str], project_root: Path) -> tuple[boo resolved_str = str(resolved) if resolved == resolved.parent: return True, "rm targeting root directory" - parts = resolved.parts - if len(parts) >= 2 and parts[1] in 
cs.SHELL_SYSTEM_DIRECTORIES: - return True, f"rm targeting system directory: {resolved_str}" try: resolved.relative_to(project_root) except ValueError: + parts = resolved.parts + if len(parts) >= 2 and parts[1] in cs.SHELL_SYSTEM_DIRECTORIES: + return True, f"rm targeting system directory: {resolved_str}" return True, f"rm targeting path outside project: {resolved_str}" return False, "" @@ -263,6 +265,8 @@ def _requires_approval(command: str) -> bool: class ShellCommander: + __slots__ = ("project_root", "timeout") + def __init__(self, project_root: str = ".", timeout: int = 30): self.project_root = Path(project_root).resolve() self.timeout = timeout diff --git a/codebase_rag/tools/tool_descriptions.py b/codebase_rag/tools/tool_descriptions.py index 008c60bef..3550743e2 100644 --- a/codebase_rag/tools/tool_descriptions.py +++ b/codebase_rag/tools/tool_descriptions.py @@ -88,13 +88,19 @@ class AgenticToolName(StrEnum): ) MCP_INDEX_REPOSITORY = ( + "WARNING: Clears all data for the current project including its embeddings. " "Parse and ingest the repository into the Memgraph knowledge graph. " - "This builds a comprehensive graph of functions, classes, dependencies, and relationships. " - "Note: This preserves other projects - only the current project is re-indexed." + "Use update_repository for incremental updates. Only use when explicitly requested." +) + +MCP_UPDATE_REPOSITORY = ( + "Update the repository in the Memgraph knowledge graph without clearing existing data. " + "Use this for incremental updates." ) MCP_QUERY_CODE_GRAPH = ( "Query the codebase knowledge graph using natural language. " + "Use semantic_search unless you know the exact names of classes/functions you are searching for. " "Ask questions like 'What functions call UserService.create_user?' or " "'Show me all classes that implement the Repository interface'." ) @@ -117,6 +123,12 @@ class AgenticToolName(StrEnum): MCP_LIST_DIRECTORY = "List contents of a directory in the project." +MCP_SEMANTIC_SEARCH = ( + "Performs a semantic search for functions based on a natural language query " + "describing their purpose, returning a list of potential matches with similarity scores. " + "Requires the 'semantic' extra to be installed." +) + MCP_PARAM_PROJECT_NAME = "Name of the project to delete (e.g., 'my-project')" MCP_PARAM_CONFIRM = "Must be true to confirm the wipe operation" MCP_PARAM_NATURAL_LANGUAGE_QUERY = "Your question in plain English about the codebase" @@ -130,6 +142,16 @@ class AgenticToolName(StrEnum): MCP_PARAM_LIMIT = "Maximum number of lines to read (optional)" MCP_PARAM_CONTENT = "Content to write to the file" MCP_PARAM_DIRECTORY_PATH = "Relative path to directory from project root (default: '.')" +MCP_PARAM_TOP_K = "Max number of results to return (optional, default: 5)" +MCP_PARAM_QUESTION = ( + "A question about the codebase, architecture, functionality, or code relationships" +) + +MCP_ASK_AGENT = ( + "Ask the Code Graph RAG agent a question about the codebase. " + "Uses the full RAG pipeline to analyze the code graph and provide a detailed answer. " + "Use this for general questions about architecture, functionality, and code relationships." 
+) MCP_TOOLS: dict[MCPToolName, str] = { @@ -137,12 +159,15 @@ class AgenticToolName(StrEnum): MCPToolName.DELETE_PROJECT: MCP_DELETE_PROJECT, MCPToolName.WIPE_DATABASE: MCP_WIPE_DATABASE, MCPToolName.INDEX_REPOSITORY: MCP_INDEX_REPOSITORY, + MCPToolName.UPDATE_REPOSITORY: MCP_UPDATE_REPOSITORY, MCPToolName.QUERY_CODE_GRAPH: MCP_QUERY_CODE_GRAPH, MCPToolName.GET_CODE_SNIPPET: MCP_GET_CODE_SNIPPET, MCPToolName.SURGICAL_REPLACE_CODE: MCP_SURGICAL_REPLACE_CODE, MCPToolName.READ_FILE: MCP_READ_FILE, MCPToolName.WRITE_FILE: MCP_WRITE_FILE, MCPToolName.LIST_DIRECTORY: MCP_LIST_DIRECTORY, + MCPToolName.SEMANTIC_SEARCH: MCP_SEMANTIC_SEARCH, + MCPToolName.ASK_AGENT: MCP_ASK_AGENT, } AGENTIC_TOOLS: dict[AgenticToolName, str] = { diff --git a/codebase_rag/types_defs.py b/codebase_rag/types_defs.py index fb293147b..d4ac33882 100644 --- a/codebase_rag/types_defs.py +++ b/codebase_rag/types_defs.py @@ -350,7 +350,7 @@ class FunctionNodeProps(TypedDict, total=False): class MCPInputSchemaProperty(TypedDict, total=False): type: str description: str - default: str + default: str | int MCPInputSchemaProperties = dict[str, MCPInputSchemaProperty] @@ -439,36 +439,47 @@ class RelationshipSchema(NamedTuple): NODE_SCHEMAS: tuple[NodeSchema, ...] = ( NodeSchema(NodeLabel.PROJECT, "{name: string}"), NodeSchema( - NodeLabel.PACKAGE, "{qualified_name: string, name: string, path: string}" + NodeLabel.PACKAGE, + "{qualified_name: string, name: string, path: string, absolute_path: string}", ), - NodeSchema(NodeLabel.FOLDER, "{path: string, name: string}"), - NodeSchema(NodeLabel.FILE, "{path: string, name: string, extension: string}"), + NodeSchema(NodeLabel.FOLDER, "{path: string, name: string, absolute_path: string}"), NodeSchema( - NodeLabel.MODULE, "{qualified_name: string, name: string, path: string}" + NodeLabel.FILE, + "{path: string, name: string, extension: string, absolute_path: string}", + ), + NodeSchema( + NodeLabel.MODULE, + "{qualified_name: string, name: string, path: string, absolute_path: string}", ), NodeSchema( NodeLabel.CLASS, - "{qualified_name: string, name: string, decorators: list[string]}", + "{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}", ), NodeSchema( NodeLabel.FUNCTION, - "{qualified_name: string, name: string, decorators: list[string]}", + "{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}", ), NodeSchema( NodeLabel.METHOD, - "{qualified_name: string, name: string, decorators: list[string]}", + "{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}", + ), + NodeSchema( + NodeLabel.INTERFACE, + "{qualified_name: string, name: string, path: string, absolute_path: string}", + ), + NodeSchema( + NodeLabel.ENUM, + "{qualified_name: string, name: string, path: string, absolute_path: string}", ), - NodeSchema(NodeLabel.INTERFACE, "{qualified_name: string, name: string}"), - NodeSchema(NodeLabel.ENUM, "{qualified_name: string, name: string}"), NodeSchema(NodeLabel.TYPE, "{qualified_name: string, name: string}"), NodeSchema(NodeLabel.UNION, "{qualified_name: string, name: string}"), NodeSchema( NodeLabel.MODULE_INTERFACE, - "{qualified_name: string, name: string, path: string}", + "{qualified_name: string, name: string, path: string, absolute_path: string}", ), NodeSchema( NodeLabel.MODULE_IMPLEMENTATION, - "{qualified_name: string, name: string, path: string, implements_module: string}", + "{qualified_name: string, name: string, path: string, 
absolute_path: string, implements_module: string}", ), NodeSchema(NodeLabel.EXTERNAL_PACKAGE, "{name: string, version_spec: string}"), ) diff --git a/codebase_rag/unixcoder.py b/codebase_rag/unixcoder.py index 6738fb677..cbd068696 100644 --- a/codebase_rag/unixcoder.py +++ b/codebase_rag/unixcoder.py @@ -190,6 +190,17 @@ def generate( class Beam: + __slots__ = ( + "_eos", + "device", + "eosTop", + "finished", + "nextYs", + "prevKs", + "scores", + "size", + ) + def __init__(self, size: int, eos: int, device: torch.device) -> None: self.size = size self.device = device diff --git a/codebase_rag/utils/fqn_resolver.py b/codebase_rag/utils/fqn_resolver.py index 470c6cc8f..ba3fe9dcd 100644 --- a/codebase_rag/utils/fqn_resolver.py +++ b/codebase_rag/utils/fqn_resolver.py @@ -40,7 +40,7 @@ def resolve_fqn_from_ast( return SEPARATOR_DOT.join(full_parts) except Exception as e: - logger.debug(ls.FQN_RESOLVE_FAILED.format(path=file_path, error=e)) + logger.debug(ls.FQN_RESOLVE_FAILED, path=file_path, error=e) return None @@ -73,7 +73,7 @@ def walk(node: Node) -> str | None: return walk(root_node) except Exception as e: - logger.debug(ls.FQN_FIND_FAILED.format(fqn=target_fqn, path=file_path, error=e)) + logger.debug(ls.FQN_FIND_FAILED, fqn=target_fqn, path=file_path, error=e) return None @@ -102,6 +102,6 @@ def walk(node: Node) -> None: walk(root_node) except Exception as e: - logger.debug(ls.FQN_EXTRACT_FAILED.format(path=file_path, error=e)) + logger.debug(ls.FQN_EXTRACT_FAILED, path=file_path, error=e) return functions diff --git a/codebase_rag/utils/source_extraction.py b/codebase_rag/utils/source_extraction.py index 548243a5f..20969db56 100644 --- a/codebase_rag/utils/source_extraction.py +++ b/codebase_rag/utils/source_extraction.py @@ -21,22 +21,28 @@ def extract_source_lines( return None try: - with open(file_path, encoding=encoding) as f: - lines = f.readlines() - - if start_line > len(lines) or end_line > len(lines): - logger.warning( - ls.SOURCE_RANGE_EXCEEDS.format( - start=start_line, - end=end_line, - length=len(lines), - path=file_path, - ) + raw_bytes = file_path.read_bytes() + text = raw_bytes.decode(encoding) + lines = text.splitlines(keepends=True) + + if not lines: + return None + + if start_line > len(lines) or end_line > len(lines): + logger.warning( + ls.SOURCE_RANGE_EXCEEDS.format( + start=start_line, + end=end_line, + length=len(lines), + path=file_path, ) + ) + end_line = min(end_line, len(lines)) + if start_line > len(lines): return None - extracted_lines = lines[start_line - 1 : end_line] - return "".join(extracted_lines).strip() + extracted_lines = lines[start_line - 1 : end_line] + return "".join(extracted_lines).strip() except Exception as e: logger.warning(ls.SOURCE_EXTRACT_FAILED.format(path=file_path, error=e)) @@ -56,7 +62,7 @@ def extract_source_with_fallback( if ast_result := ast_extractor(qualified_name, file_path): return str(ast_result) except Exception as e: - logger.debug(ls.SOURCE_AST_FAILED.format(name=qualified_name, error=e)) + logger.debug(ls.SOURCE_AST_FAILED, name=qualified_name, error=e) return extract_source_lines(file_path, start_line, end_line, encoding) diff --git a/codebase_rag/utils/token_utils.py b/codebase_rag/utils/token_utils.py new file mode 100644 index 000000000..031262d06 --- /dev/null +++ b/codebase_rag/utils/token_utils.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +import json +from functools import cache + +import tiktoken +from loguru import logger + +from .. import constants as cs +from .. 
import logs as ls +from ..types_defs import ResultRow + + +@cache +def _get_encoding() -> tiktoken.Encoding: + return tiktoken.get_encoding(cs.TIKTOKEN_ENCODING) + + +def count_tokens(text: str) -> int: + return len(_get_encoding().encode(text)) + + +def truncate_results_by_tokens( + results: list[ResultRow], + max_tokens: int, + original_total: int | None = None, +) -> tuple[list[ResultRow], int, bool]: + if not results: + return results, 0, False + + kept: list[ResultRow] = [] + total_tokens = 0 + total_for_log = original_total if original_total is not None else len(results) + + for row in results: + row_text = json.dumps(row, default=str) + row_tokens = count_tokens(row_text) + + if total_tokens + row_tokens > max_tokens and kept: + logger.warning( + ls.QUERY_RESULTS_TRUNCATED.format( + kept=len(kept), + total=total_for_log, + tokens=total_tokens, + max_tokens=max_tokens, + ) + ) + return kept, total_tokens, True + + kept.append(row) + total_tokens += row_tokens + + return kept, total_tokens, False diff --git a/codebase_rag/vector_store.py b/codebase_rag/vector_store.py index 6580b43c2..21ae30b70 100644 --- a/codebase_rag/vector_store.py +++ b/codebase_rag/vector_store.py @@ -1,3 +1,6 @@ +import time +from collections.abc import Sequence + from loguru import logger from . import logs as ls @@ -5,12 +8,20 @@ from .constants import PAYLOAD_NODE_ID, PAYLOAD_QUALIFIED_NAME from .utils.dependencies import has_qdrant_client +_RETRIEVE_BATCH_SIZE = 1000 + if has_qdrant_client(): from qdrant_client import QdrantClient from qdrant_client.models import Distance, PointStruct, VectorParams _CLIENT: QdrantClient | None = None + def close_qdrant_client() -> None: + global _CLIENT + if _CLIENT is not None: + _CLIENT.close() + _CLIENT = None + def get_qdrant_client() -> QdrantClient: global _CLIENT if _CLIENT is None: @@ -24,28 +35,92 @@ def get_qdrant_client() -> QdrantClient: ) return _CLIENT + def _upsert_with_retry(points: list[PointStruct]) -> None: + client = get_qdrant_client() + max_attempts = settings.QDRANT_UPSERT_RETRIES + base_delay = settings.QDRANT_RETRY_BASE_DELAY + for attempt in range(1, max_attempts + 1): + try: + client.upsert( + collection_name=settings.QDRANT_COLLECTION_NAME, + points=points, + ) + return + except Exception as e: + if attempt == max_attempts: + raise + delay = base_delay * (2 ** (attempt - 1)) + logger.warning( + ls.EMBEDDING_STORE_RETRY.format( + attempt=attempt, max_attempts=max_attempts, delay=delay, error=e + ) + ) + time.sleep(delay) + def store_embedding( node_id: int, embedding: list[float], qualified_name: str ) -> None: + store_embedding_batch([(node_id, embedding, qualified_name)]) + + def store_embedding_batch( + points: Sequence[tuple[int, list[float], str]], + ) -> int: + if not points: + return 0 + point_structs = [ + PointStruct( + id=node_id, + vector=embedding, + payload={ + PAYLOAD_NODE_ID: node_id, + PAYLOAD_QUALIFIED_NAME: qualified_name, + }, + ) + for node_id, embedding, qualified_name in points + ] + try: + _upsert_with_retry(point_structs) + logger.debug(ls.EMBEDDING_BATCH_STORED.format(count=len(point_structs))) + return len(point_structs) + except Exception as e: + logger.warning(ls.EMBEDDING_BATCH_FAILED.format(error=e)) + return 0 + + def delete_project_embeddings(project_name: str, node_ids: Sequence[int]) -> None: + if not node_ids: + return try: + logger.info( + ls.QDRANT_DELETE_PROJECT.format( + count=len(node_ids), project=project_name + ) + ) client = get_qdrant_client() - client.upsert( + client.delete( 
collection_name=settings.QDRANT_COLLECTION_NAME, - points=[ - PointStruct( - id=node_id, - vector=embedding, - payload={ - PAYLOAD_NODE_ID: node_id, - PAYLOAD_QUALIFIED_NAME: qualified_name, - }, - ) - ], + points_selector=list(node_ids), ) + logger.info(ls.QDRANT_DELETE_PROJECT_DONE.format(project=project_name)) except Exception as e: logger.warning( - ls.EMBEDDING_STORE_FAILED.format(name=qualified_name, error=e) + ls.QDRANT_DELETE_PROJECT_FAILED.format(project=project_name, error=e) + ) + + def verify_stored_ids(expected_ids: set[int]) -> set[int]: + if not expected_ids: + return set() + client = get_qdrant_client() + found_ids: set[int] = set() + ids_list = list(expected_ids) + for i in range(0, len(ids_list), _RETRIEVE_BATCH_SIZE): + points = client.retrieve( + collection_name=settings.QDRANT_COLLECTION_NAME, + ids=ids_list[i : i + _RETRIEVE_BATCH_SIZE], + with_payload=False, + with_vectors=False, ) + found_ids.update(p.id for p in points if isinstance(p.id, int)) + return found_ids def search_embeddings( query_embedding: list[float], top_k: int | None = None @@ -69,11 +144,25 @@ def search_embeddings( else: + def close_qdrant_client() -> None: + pass + def store_embedding( node_id: int, embedding: list[float], qualified_name: str ) -> None: pass + def store_embedding_batch( + points: Sequence[tuple[int, list[float], str]], + ) -> int: + return 0 + + def delete_project_embeddings(project_name: str, node_ids: Sequence[int]) -> None: + pass + + def verify_stored_ids(expected_ids: set[int]) -> set[int]: + return set() + def search_embeddings( query_embedding: list[float], top_k: int | None = None ) -> list[tuple[int, float]]: diff --git a/codec/schema.proto b/codec/schema.proto index fcd28e6c2..06832c97f 100644 --- a/codec/schema.proto +++ b/codec/schema.proto @@ -102,6 +102,10 @@ message GraphCodeIndex { ExternalPackage external_package = 9; ModuleImplementation module_implementation = 10; ModuleInterface module_interface = 11; + Interface interface_node = 12; + Enum enum_node = 13; + Type type_node = 14; + Union union_node = 15; } } @@ -123,6 +127,8 @@ message GraphCodeIndex { DEPENDS_ON_EXTERNAL = 11; IMPLEMENTS_MODULE = 12; IMPLEMENTS = 13; + EXPORTS = 14; + EXPORTS_MODULE = 15; } RelationshipType type = 1; @@ -232,3 +238,35 @@ message GraphCodeIndex { repeated string decorators = 6; bool is_exported = 7; } + + message Interface { + // Primary Key + string qualified_name = 1; + + string name = 2; + string path = 3; + string absolute_path = 4; + } + + message Enum { + // Primary Key + string qualified_name = 1; + + string name = 2; + string path = 3; + string absolute_path = 4; + } + + message Type { + // Primary Key + string qualified_name = 1; + + string name = 2; + } + + message Union { + // Primary Key + string qualified_name = 1; + + string name = 2; + } diff --git a/codec/schema_pb2.py b/codec/schema_pb2.py index 5dd666f71..fcae069dd 100644 --- a/codec/schema_pb2.py +++ b/codec/schema_pb2.py @@ -1,61 +1,62 @@ +# -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! 
-# NO CHECKED-IN PROTOBUF GENCODE # source: codec/schema.proto -# Protobuf Python Version: 6.33.1 """Generated protocol buffer code.""" - +from google.protobuf.internal import builder as _builder from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import runtime_version as _runtime_version -from google.protobuf import struct_pb2 as _struct_pb2 from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -_runtime_version.ValidateProtobufRuntimeVersion( - _runtime_version.Domain.PUBLIC, 6, 33, 1, "", "codec/schema.proto" -) # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x12\x63odec/schema.proto\x12\x0cgraphcode.v1\x1a\x1cgoogle/protobuf/struct.proto"f\n\x0eGraphCodeIndex\x12!\n\x05nodes\x18\x01 \x03(\x0b\x32\x12.graphcode.v1.Node\x12\x31\n\rrelationships\x18\x02 \x03(\x0b\x32\x1a.graphcode.v1.Relationship"\x93\x04\n\x04Node\x12(\n\x07project\x18\x01 \x01(\x0b\x32\x15.graphcode.v1.ProjectH\x00\x12(\n\x07package\x18\x02 \x01(\x0b\x32\x15.graphcode.v1.PackageH\x00\x12&\n\x06\x66older\x18\x03 \x01(\x0b\x32\x14.graphcode.v1.FolderH\x00\x12&\n\x06module\x18\x04 \x01(\x0b\x32\x14.graphcode.v1.ModuleH\x00\x12)\n\nclass_node\x18\x05 \x01(\x0b\x32\x13.graphcode.v1.ClassH\x00\x12*\n\x08\x66unction\x18\x06 \x01(\x0b\x32\x16.graphcode.v1.FunctionH\x00\x12&\n\x06method\x18\x07 \x01(\x0b\x32\x14.graphcode.v1.MethodH\x00\x12"\n\x04\x66ile\x18\x08 \x01(\x0b\x32\x12.graphcode.v1.FileH\x00\x12\x39\n\x10\x65xternal_package\x18\t \x01(\x0b\x32\x1d.graphcode.v1.ExternalPackageH\x00\x12\x43\n\x15module_implementation\x18\n \x01(\x0b\x32".graphcode.v1.ModuleImplementationH\x00\x12\x39\n\x10module_interface\x18\x0b \x01(\x0b\x32\x1d.graphcode.v1.ModuleInterfaceH\x00\x42\t\n\x07payload"\xe9\x03\n\x0cRelationship\x12\x39\n\x04type\x18\x01 \x01(\x0e\x32+.graphcode.v1.Relationship.RelationshipType\x12\x11\n\tsource_id\x18\x02 \x01(\t\x12\x11\n\ttarget_id\x18\x03 \x01(\t\x12+\n\nproperties\x18\x04 \x01(\x0b\x32\x17.google.protobuf.Struct\x12\x14\n\x0csource_label\x18\x05 \x01(\t\x12\x14\n\x0ctarget_label\x18\x06 \x01(\t"\x9e\x02\n\x10RelationshipType\x12!\n\x1dRELATIONSHIP_TYPE_UNSPECIFIED\x10\x00\x12\x14\n\x10\x43ONTAINS_PACKAGE\x10\x01\x12\x13\n\x0f\x43ONTAINS_FOLDER\x10\x02\x12\x11\n\rCONTAINS_FILE\x10\x03\x12\x13\n\x0f\x43ONTAINS_MODULE\x10\x04\x12\x0b\n\x07\x44\x45\x46INES\x10\x05\x12\x12\n\x0e\x44\x45\x46INES_METHOD\x10\x06\x12\x0b\n\x07IMPORTS\x10\x07\x12\x0c\n\x08INHERITS\x10\x08\x12\r\n\tOVERRIDES\x10\t\x12\t\n\x05\x43\x41LLS\x10\n\x12\x17\n\x13\x44\x45PENDS_ON_EXTERNAL\x10\x0b\x12\x15\n\x11IMPLEMENTS_MODULE\x10\x0c\x12\x0e\n\nIMPLEMENTS\x10\r"\x17\n\x07Project\x12\x0c\n\x04name\x18\x01 \x01(\t"=\n\x07Package\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t"$\n\x06\x46older\x12\x0c\n\x04path\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t"5\n\x04\x46ile\x12\x0c\n\x04path\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\textension\x18\x03 \x01(\t"<\n\x06Module\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t"e\n\x14ModuleImplementation\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\x12\x19\n\x11implements_module\x18\x04 
\x01(\t"E\n\x0fModuleInterface\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t"\x1f\n\x0f\x45xternalPackage\x12\x0c\n\x04name\x18\x01 \x01(\t"\x92\x01\n\x08\x46unction\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tdocstring\x18\x03 \x01(\t\x12\x12\n\nstart_line\x18\x04 \x01(\x05\x12\x10\n\x08\x65nd_line\x18\x05 \x01(\x05\x12\x12\n\ndecorators\x18\x06 \x03(\t\x12\x13\n\x0bis_exported\x18\x07 \x01(\x08"{\n\x06Method\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tdocstring\x18\x03 \x01(\t\x12\x12\n\nstart_line\x18\x04 \x01(\x05\x12\x10\n\x08\x65nd_line\x18\x05 \x01(\x05\x12\x12\n\ndecorators\x18\x06 \x03(\t"\x8f\x01\n\x05\x43lass\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tdocstring\x18\x03 \x01(\t\x12\x12\n\nstart_line\x18\x04 \x01(\x05\x12\x10\n\x08\x65nd_line\x18\x05 \x01(\x05\x12\x12\n\ndecorators\x18\x06 \x03(\t\x12\x13\n\x0bis_exported\x18\x07 \x01(\x08\x62\x06proto3' -) +from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x12\x63odec/schema.proto\x12\x0cgraphcode.v1\x1a\x1cgoogle/protobuf/struct.proto\"f\n\x0eGraphCodeIndex\x12!\n\x05nodes\x18\x01 \x03(\x0b\x32\x12.graphcode.v1.Node\x12\x31\n\rrelationships\x18\x02 \x03(\x0b\x32\x1a.graphcode.v1.Relationship\"\xc3\x05\n\x04Node\x12(\n\x07project\x18\x01 \x01(\x0b\x32\x15.graphcode.v1.ProjectH\x00\x12(\n\x07package\x18\x02 \x01(\x0b\x32\x15.graphcode.v1.PackageH\x00\x12&\n\x06\x66older\x18\x03 \x01(\x0b\x32\x14.graphcode.v1.FolderH\x00\x12&\n\x06module\x18\x04 \x01(\x0b\x32\x14.graphcode.v1.ModuleH\x00\x12)\n\nclass_node\x18\x05 \x01(\x0b\x32\x13.graphcode.v1.ClassH\x00\x12*\n\x08\x66unction\x18\x06 \x01(\x0b\x32\x16.graphcode.v1.FunctionH\x00\x12&\n\x06method\x18\x07 \x01(\x0b\x32\x14.graphcode.v1.MethodH\x00\x12\"\n\x04\x66ile\x18\x08 \x01(\x0b\x32\x12.graphcode.v1.FileH\x00\x12\x39\n\x10\x65xternal_package\x18\t \x01(\x0b\x32\x1d.graphcode.v1.ExternalPackageH\x00\x12\x43\n\x15module_implementation\x18\n \x01(\x0b\x32\".graphcode.v1.ModuleImplementationH\x00\x12\x39\n\x10module_interface\x18\x0b \x01(\x0b\x32\x1d.graphcode.v1.ModuleInterfaceH\x00\x12\x31\n\x0einterface_node\x18\x0c \x01(\x0b\x32\x17.graphcode.v1.InterfaceH\x00\x12\'\n\tenum_node\x18\r \x01(\x0b\x32\x12.graphcode.v1.EnumH\x00\x12\'\n\ttype_node\x18\x0e \x01(\x0b\x32\x12.graphcode.v1.TypeH\x00\x12)\n\nunion_node\x18\x0f \x01(\x0b\x32\x13.graphcode.v1.UnionH\x00\x42\t\n\x07payload\"\x8a\x04\n\x0cRelationship\x12\x39\n\x04type\x18\x01 \x01(\x0e\x32+.graphcode.v1.Relationship.RelationshipType\x12\x11\n\tsource_id\x18\x02 \x01(\t\x12\x11\n\ttarget_id\x18\x03 \x01(\t\x12+\n\nproperties\x18\x04 \x01(\x0b\x32\x17.google.protobuf.Struct\x12\x14\n\x0csource_label\x18\x05 \x01(\t\x12\x14\n\x0ctarget_label\x18\x06 
\x01(\t\"\xbf\x02\n\x10RelationshipType\x12!\n\x1dRELATIONSHIP_TYPE_UNSPECIFIED\x10\x00\x12\x14\n\x10\x43ONTAINS_PACKAGE\x10\x01\x12\x13\n\x0f\x43ONTAINS_FOLDER\x10\x02\x12\x11\n\rCONTAINS_FILE\x10\x03\x12\x13\n\x0f\x43ONTAINS_MODULE\x10\x04\x12\x0b\n\x07\x44\x45\x46INES\x10\x05\x12\x12\n\x0e\x44\x45\x46INES_METHOD\x10\x06\x12\x0b\n\x07IMPORTS\x10\x07\x12\x0c\n\x08INHERITS\x10\x08\x12\r\n\tOVERRIDES\x10\t\x12\t\n\x05\x43\x41LLS\x10\n\x12\x17\n\x13\x44\x45PENDS_ON_EXTERNAL\x10\x0b\x12\x15\n\x11IMPLEMENTS_MODULE\x10\x0c\x12\x0e\n\nIMPLEMENTS\x10\r\x12\x0b\n\x07\x45XPORTS\x10\x0e\x12\x12\n\x0e\x45XPORTS_MODULE\x10\x0f\"\x17\n\x07Project\x12\x0c\n\x04name\x18\x01 \x01(\t\"=\n\x07Package\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\"$\n\x06\x46older\x12\x0c\n\x04path\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\"5\n\x04\x46ile\x12\x0c\n\x04path\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\textension\x18\x03 \x01(\t\"<\n\x06Module\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\"e\n\x14ModuleImplementation\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\x12\x19\n\x11implements_module\x18\x04 \x01(\t\"E\n\x0fModuleInterface\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\"\x1f\n\x0f\x45xternalPackage\x12\x0c\n\x04name\x18\x01 \x01(\t\"\x92\x01\n\x08\x46unction\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tdocstring\x18\x03 \x01(\t\x12\x12\n\nstart_line\x18\x04 \x01(\x05\x12\x10\n\x08\x65nd_line\x18\x05 \x01(\x05\x12\x12\n\ndecorators\x18\x06 \x03(\t\x12\x13\n\x0bis_exported\x18\x07 \x01(\x08\"{\n\x06Method\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tdocstring\x18\x03 \x01(\t\x12\x12\n\nstart_line\x18\x04 \x01(\x05\x12\x10\n\x08\x65nd_line\x18\x05 \x01(\x05\x12\x12\n\ndecorators\x18\x06 \x03(\t\"\x8f\x01\n\x05\x43lass\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tdocstring\x18\x03 \x01(\t\x12\x12\n\nstart_line\x18\x04 \x01(\x05\x12\x10\n\x08\x65nd_line\x18\x05 \x01(\x05\x12\x12\n\ndecorators\x18\x06 \x03(\t\x12\x13\n\x0bis_exported\x18\x07 \x01(\x08\"V\n\tInterface\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\x12\x15\n\rabsolute_path\x18\x04 \x01(\t\"Q\n\x04\x45num\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04path\x18\x03 \x01(\t\x12\x15\n\rabsolute_path\x18\x04 \x01(\t\",\n\x04Type\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\"-\n\x05Union\x12\x16\n\x0equalified_name\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\tb\x06proto3') + +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'codec.schema_pb2', globals()) +if _descriptor._USE_C_DESCRIPTORS == False: -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "codec.schema_pb2", _globals) -if not _descriptor._USE_C_DESCRIPTORS: - DESCRIPTOR._loaded_options = None - _globals["_GRAPHCODEINDEX"]._serialized_start = 66 - _globals["_GRAPHCODEINDEX"]._serialized_end = 168 - _globals["_NODE"]._serialized_start = 171 - _globals["_NODE"]._serialized_end = 702 - 
_globals["_RELATIONSHIP"]._serialized_start = 705 - _globals["_RELATIONSHIP"]._serialized_end = 1194 - _globals["_RELATIONSHIP_RELATIONSHIPTYPE"]._serialized_start = 908 - _globals["_RELATIONSHIP_RELATIONSHIPTYPE"]._serialized_end = 1194 - _globals["_PROJECT"]._serialized_start = 1196 - _globals["_PROJECT"]._serialized_end = 1219 - _globals["_PACKAGE"]._serialized_start = 1221 - _globals["_PACKAGE"]._serialized_end = 1282 - _globals["_FOLDER"]._serialized_start = 1284 - _globals["_FOLDER"]._serialized_end = 1320 - _globals["_FILE"]._serialized_start = 1322 - _globals["_FILE"]._serialized_end = 1375 - _globals["_MODULE"]._serialized_start = 1377 - _globals["_MODULE"]._serialized_end = 1437 - _globals["_MODULEIMPLEMENTATION"]._serialized_start = 1439 - _globals["_MODULEIMPLEMENTATION"]._serialized_end = 1540 - _globals["_MODULEINTERFACE"]._serialized_start = 1542 - _globals["_MODULEINTERFACE"]._serialized_end = 1611 - _globals["_EXTERNALPACKAGE"]._serialized_start = 1613 - _globals["_EXTERNALPACKAGE"]._serialized_end = 1644 - _globals["_FUNCTION"]._serialized_start = 1647 - _globals["_FUNCTION"]._serialized_end = 1793 - _globals["_METHOD"]._serialized_start = 1795 - _globals["_METHOD"]._serialized_end = 1918 - _globals["_CLASS"]._serialized_start = 1921 - _globals["_CLASS"]._serialized_end = 2064 + DESCRIPTOR._options = None + _GRAPHCODEINDEX._serialized_start=66 + _GRAPHCODEINDEX._serialized_end=168 + _NODE._serialized_start=171 + _NODE._serialized_end=878 + _RELATIONSHIP._serialized_start=881 + _RELATIONSHIP._serialized_end=1403 + _RELATIONSHIP_RELATIONSHIPTYPE._serialized_start=1084 + _RELATIONSHIP_RELATIONSHIPTYPE._serialized_end=1403 + _PROJECT._serialized_start=1405 + _PROJECT._serialized_end=1428 + _PACKAGE._serialized_start=1430 + _PACKAGE._serialized_end=1491 + _FOLDER._serialized_start=1493 + _FOLDER._serialized_end=1529 + _FILE._serialized_start=1531 + _FILE._serialized_end=1584 + _MODULE._serialized_start=1586 + _MODULE._serialized_end=1646 + _MODULEIMPLEMENTATION._serialized_start=1648 + _MODULEIMPLEMENTATION._serialized_end=1749 + _MODULEINTERFACE._serialized_start=1751 + _MODULEINTERFACE._serialized_end=1820 + _EXTERNALPACKAGE._serialized_start=1822 + _EXTERNALPACKAGE._serialized_end=1853 + _FUNCTION._serialized_start=1856 + _FUNCTION._serialized_end=2002 + _METHOD._serialized_start=2004 + _METHOD._serialized_end=2127 + _CLASS._serialized_start=2130 + _CLASS._serialized_end=2273 + _INTERFACE._serialized_start=2275 + _INTERFACE._serialized_end=2361 + _ENUM._serialized_start=2363 + _ENUM._serialized_end=2444 + _TYPE._serialized_start=2446 + _TYPE._serialized_end=2490 + _UNION._serialized_start=2492 + _UNION._serialized_end=2537 # @@protoc_insertion_point(module_scope) diff --git a/docs/advanced/adding-languages.md b/docs/advanced/adding-languages.md new file mode 100644 index 000000000..5ddc87168 --- /dev/null +++ b/docs/advanced/adding-languages.md @@ -0,0 +1,104 @@ +--- +description: "Add support for new programming languages to Code-Graph-RAG using Tree-sitter grammars." +--- + +# Adding Languages + +Code-Graph-RAG makes it easy to add support for any language that has a Tree-sitter grammar. The system automatically handles grammar compilation and integration. + +!!! warning + While you can add languages yourself, we recommend waiting for official full support to ensure optimal parsing quality, comprehensive feature coverage, and robust integration. [Submit a language request](https://github.com/vitali87/code-graph-rag/issues) if you need a specific language supported. 
+ +## Quick Start + +Use the built-in language management tool: + +```bash +cgr language add-grammar +``` + +Examples: + +```bash +cgr language add-grammar c-sharp +cgr language add-grammar php +cgr language add-grammar ruby +cgr language add-grammar kotlin +``` + +## Custom Grammar Repositories + +For languages hosted outside the standard tree-sitter organization: + +```bash +cgr language add-grammar --grammar-url https://github.com/custom/tree-sitter-mylang +``` + +## What Happens Automatically + +When you add a language, the tool automatically: + +1. **Downloads the Grammar**: Clones the tree-sitter grammar repository as a git submodule +2. **Detects Configuration**: Auto-extracts language metadata from `tree-sitter.json` +3. **Analyzes Node Types**: Automatically identifies AST node types for functions/methods, classes/structs, modules/files, and function calls +4. **Compiles Bindings**: Builds Python bindings from the grammar source +5. **Updates Configuration**: Adds the language to `codebase_rag/language_config.py` +6. **Enables Parsing**: Makes the language immediately available for codebase analysis + +## Example: Adding C# Support + +```bash +$ cgr language add-grammar c-sharp +Using default tree-sitter URL: https://github.com/tree-sitter/tree-sitter-c-sharp +Adding submodule from https://github.com/tree-sitter/tree-sitter-c-sharp... +Successfully added submodule at grammars/tree-sitter-c-sharp +Auto-detected language: c-sharp +Auto-detected file extensions: ['cs'] +Auto-detected node types: +Functions: ['destructor_declaration', 'method_declaration', 'constructor_declaration'] +Classes: ['struct_declaration', 'enum_declaration', 'interface_declaration', 'class_declaration'] +Modules: ['compilation_unit', 'file_scoped_namespace_declaration', 'namespace_declaration'] +Calls: ['invocation_expression'] + +Language 'c-sharp' has been added to the configuration! +Updated codebase_rag/language_config.py +``` + +## Managing Languages + +```bash +cgr language list-languages + +cgr language remove-language +``` + +## Language Configuration + +Each language is defined in `codebase_rag/language_config.py`: + +```python +"language-name": LanguageConfig( + name="language-name", + file_extensions=[".ext1", ".ext2"], + function_node_types=["function_declaration", "method_declaration"], + class_node_types=["class_declaration", "struct_declaration"], + module_node_types=["compilation_unit", "source_file"], + call_node_types=["call_expression", "method_invocation"], +), +``` + +## Troubleshooting + +**Grammar not found**: Use a custom URL if the automatic URL doesn't work: + +```bash +cgr language add-grammar --grammar-url https://github.com/custom/tree-sitter-mylang +``` + +**Version incompatibility**: If you get "Incompatible Language version" errors: + +```bash +uv add tree-sitter@latest +``` + +**Missing node types**: The tool automatically detects common node patterns, but you can manually adjust the configuration in `language_config.py` if needed. diff --git a/docs/advanced/building-binaries.md b/docs/advanced/building-binaries.md new file mode 100644 index 000000000..b250d52c7 --- /dev/null +++ b/docs/advanced/building-binaries.md @@ -0,0 +1,15 @@ +--- +description: "Build a standalone binary of Code-Graph-RAG using PyInstaller." +--- + +# Building Binaries + +You can build a standalone binary of Code-Graph-RAG using the `build_binary.py` script. This uses PyInstaller to package the application and its dependencies into a single executable. 
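+
+Under the hood, a script like this typically drives PyInstaller through its
+programmatic entry point. The sketch below is illustrative only: the
+entry-point path and flags are assumptions, and `build_binary.py` remains the
+authoritative source for the options actually used.
+
+```python
+# Minimal sketch of a programmatic PyInstaller build (assumed entry point
+# and flags; see build_binary.py for the real configuration).
+import PyInstaller.__main__
+
+PyInstaller.__main__.run(
+    [
+        "codebase_rag/main.py",  # hypothetical entry-point path
+        "--onefile",  # bundle everything into one self-contained executable
+        "--name",
+        "code-graph-rag",  # name of the binary written to the dist directory
+    ]
+)
+```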
+ +## Build + +```bash +python build_binary.py +``` + +The resulting binary will be located in the `dist` directory. diff --git a/docs/advanced/ignore-patterns.md b/docs/advanced/ignore-patterns.md new file mode 100644 index 000000000..a17ad4b70 --- /dev/null +++ b/docs/advanced/ignore-patterns.md @@ -0,0 +1,28 @@ +--- +description: "Configure .cgrignore to exclude directories from Code-Graph-RAG analysis." +--- + +# Ignore Patterns + +You can specify additional directories to exclude from analysis by creating a `.cgrignore` file in your repository root. + +## Format + +``` +# Comments start with # +vendor +.custom_cache +my_build_output +``` + +## Rules + +- One directory name per line +- Lines starting with `#` are comments +- Blank lines are ignored +- Patterns are exact directory name matches (not globs) +- Patterns from `.cgrignore` are merged with `--exclude` flags and auto-detected directories + +## Default Exclusions + +Code-Graph-RAG automatically excludes common non-source directories such as `.git`, `node_modules`, `__pycache__`, `dist`, `build`, and similar. diff --git a/docs/advanced/troubleshooting.md b/docs/advanced/troubleshooting.md new file mode 100644 index 000000000..22a2dd27c --- /dev/null +++ b/docs/advanced/troubleshooting.md @@ -0,0 +1,46 @@ +--- +description: "Troubleshoot common Code-Graph-RAG issues with Memgraph, Ollama, and model configuration." +--- + +# Troubleshooting + +## Check Memgraph Connection + +- Ensure Docker containers are running: `docker compose ps` +- Verify Memgraph is accessible on port 7687 + +## View Database in Memgraph Lab + +- Open [http://localhost:3000](http://localhost:3000) +- Connect to `memgraph:7687` + +## Local Model Issues (Ollama) + +- Verify Ollama is running: `ollama list` +- Check if models are downloaded: `ollama pull llama3` +- Test Ollama API: `curl http://localhost:11434/v1/models` +- Check Ollama logs: `ollama logs` + +## General Checklist + +1. Check the logs for error details +2. Verify Memgraph connection +3. Ensure all environment variables are set +4. Review the graph schema matches your expectations +5. Run `cgr doctor` to validate your setup + +## Language Grammar Issues + +**Grammar not found**: Use a custom URL: + +```bash +cgr language add-grammar --grammar-url https://github.com/custom/tree-sitter-mylang +``` + +**Version incompatibility**: Update tree-sitter: + +```bash +uv add tree-sitter@latest +``` + +**Missing node types**: Manually adjust the configuration in `codebase_rag/language_config.py`. diff --git a/docs/architecture/graph-schema.md b/docs/architecture/graph-schema.md new file mode 100644 index 000000000..7412d7ae5 --- /dev/null +++ b/docs/architecture/graph-schema.md @@ -0,0 +1,159 @@ +--- +description: "Knowledge graph schema with node types, relationships, and language-specific AST mappings." +--- + +# Graph Schema + +The knowledge graph uses a unified schema across all supported languages. 
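+
+In practice this means one Cypher query can answer the same question across
+every indexed language. A small illustrative sketch follows; the helper is
+hypothetical and only assumes an ingestor exposing `fetch_all(query, params)`,
+as the RAG tools in this repository do.
+
+```python
+# Illustrative sketch: all languages share the Class/Method labels and the
+# DEFINES_METHOD relationship, so one query covers Python, Java, TS, etc.
+def methods_of(ingestor, class_name: str) -> list[dict]:
+    cypher = (
+        "MATCH (c:Class {name: $name})-[:DEFINES_METHOD]->(m:Method) "
+        "RETURN m.qualified_name"
+    )
+    return ingestor.fetch_all(cypher, {"name": class_name})
+```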
+
+## Node Types
+
+| Label | Properties |
+|-------|------------|
+| Project | `{name: string}` |
+| Package | `{qualified_name: string, name: string, path: string, absolute_path: string}` |
+| Folder | `{path: string, name: string, absolute_path: string}` |
+| File | `{path: string, name: string, extension: string, absolute_path: string}` |
+| Module | `{qualified_name: string, name: string, path: string, absolute_path: string}` |
+| Class | `{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}` |
+| Function | `{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}` |
+| Method | `{qualified_name: string, name: string, decorators: list[string], path: string, absolute_path: string}` |
+| Interface | `{qualified_name: string, name: string, path: string, absolute_path: string}` |
+| Enum | `{qualified_name: string, name: string, path: string, absolute_path: string}` |
+| Type | `{qualified_name: string, name: string}` |
+| Union | `{qualified_name: string, name: string}` |
+| ModuleInterface | `{qualified_name: string, name: string, path: string, absolute_path: string}` |
+| ModuleImplementation | `{qualified_name: string, name: string, path: string, absolute_path: string, implements_module: string}` |
+| ExternalPackage | `{name: string, version_spec: string}` |
+
+## Relationships
+
+| Source | Relationship | Target |
+|--------|-------------|--------|
+| Project, Package, Folder | CONTAINS_PACKAGE | Package |
+| Project, Package, Folder | CONTAINS_FOLDER | Folder |
+| Project, Package, Folder | CONTAINS_FILE | File |
+| Project, Package, Folder | CONTAINS_MODULE | Module |
+| Module | DEFINES | Class, Function |
+| Class | DEFINES_METHOD | Method |
+| Module | IMPORTS | Module |
+| Module | EXPORTS | Class, Function |
+| Module | EXPORTS_MODULE | ModuleInterface |
+| Module | IMPLEMENTS_MODULE | ModuleImplementation |
+| Class | INHERITS | Class |
+| Class | IMPLEMENTS | Interface |
+| Method | OVERRIDES | Method |
+| ModuleImplementation | IMPLEMENTS | ModuleInterface |
+| Project | DEPENDS_ON_EXTERNAL | ExternalPackage |
+| Function, Method | CALLS | Function, Method |
+
+## Language-Specific AST Mappings
+
+### C++
+
+- `class_specifier`
+- `declaration`
+- `enum_specifier`
+- `field_declaration`
+- `function_definition`
+- `lambda_expression`
+- `struct_specifier`
+- `template_declaration`
+- `union_specifier`
+
+### Java
+
+- `annotation_type_declaration`
+- `class_declaration`
+- `constructor_declaration`
+- `enum_declaration`
+- `interface_declaration`
+- `method_declaration`
+- `record_declaration`
+
+### JavaScript
+
+- `arrow_function`
+- `class`
+- `class_declaration`
+- `function_declaration`
+- `function_expression`
+- `generator_function_declaration`
+- `method_definition`
+
+### Lua
+
+- `function_declaration`
+- `function_definition`
+
+### Python
+
+- `class_definition`
+- `function_definition`
+
+### Rust
+
+- `closure_expression`
+- `enum_item`
+- `function_item`
+- `function_signature_item`
+- `impl_item`
+- `struct_item`
+- `trait_item`
+- `type_item`
+- `union_item`
+
+### TypeScript
+
+- `abstract_class_declaration`
+- `arrow_function`
+- `class`
+- `class_declaration`
+- `enum_declaration`
+- `function_declaration`
+- `function_expression`
+- `function_signature`
+- `generator_function_declaration`
+- `interface_declaration`
+- `internal_module`
+- `method_definition`
+- `type_alias_declaration`
+
+### C#
+
+- `anonymous_method_expression`
+- `class_declaration`
+- `constructor_declaration`
+- `destructor_declaration`
+- `enum_declaration`
+- `function_pointer_type`
+- `interface_declaration`
+- `lambda_expression`
+- `local_function_statement`
+- `method_declaration`
+- `struct_declaration`
+
+### Go
+
+- `function_declaration`
+- `method_declaration`
+- `type_declaration`
+
+### PHP
+ +- `anonymous_function` +- `arrow_function` +- `class_declaration` +- `enum_declaration` +- `function_definition` +- `function_static_declaration` +- `interface_declaration` +- `trait_declaration` + +### Scala + +- `class_definition` +- `function_declaration` +- `function_definition` +- `object_definition` +- `trait_definition` diff --git a/docs/architecture/language-support.md b/docs/architecture/language-support.md new file mode 100644 index 000000000..9398b05e5 --- /dev/null +++ b/docs/architecture/language-support.md @@ -0,0 +1,34 @@ +--- +description: "Supported programming languages and their feature coverage in Code-Graph-RAG." +--- + +# Language Support + +Code-Graph-RAG uses Tree-sitter for language-agnostic AST parsing with a unified graph schema across all languages. + +## Support Matrix + +| Language | Status | Extensions | Functions | Classes/Structs | Modules | Package Detection | Additional Features | +|----------|--------|------------|-----------|-----------------|---------|-------------------|---------------------| +| C++ | Fully Supported | .cpp, .h, .hpp, .cc, .cxx, .hxx, .hh, .ixx, .cppm, .ccm | Yes | Yes | Yes | Yes | Constructors, destructors, operator overloading, templates, lambdas, C++20 modules, namespaces | +| Java | Fully Supported | .java | Yes | Yes | Yes | No | Generics, annotations, modern features (records/sealed classes), concurrency, reflection | +| JavaScript | Fully Supported | .js, .jsx | Yes | Yes | Yes | No | ES6 modules, CommonJS, prototype methods, object methods, arrow functions | +| Lua | Fully Supported | .lua | Yes | No | Yes | No | Local/global functions, metatables, closures, coroutines | +| Python | Fully Supported | .py | Yes | Yes | Yes | Yes | Type inference, decorators, nested functions | +| Rust | Fully Supported | .rs | Yes | Yes | Yes | Yes | impl blocks, associated functions | +| TypeScript | Fully Supported | .ts, .tsx | Yes | Yes | Yes | No | Interfaces, type aliases, enums, namespaces, ES6/CommonJS modules | +| C# | In Development | .cs | Yes | Yes | Yes | No | Classes, interfaces, generics (planned) | +| Go | In Development | .go | Yes | Yes | Yes | No | Methods, type declarations | +| PHP | Fully Supported | .php | Yes | Yes | Yes | No | Classes, interfaces, traits, enums, namespaces, PHP 8 attributes | +| Scala | In Development | .scala, .sc | Yes | Yes | Yes | No | Case classes, objects | + +## Language-Agnostic Design + +All languages share a unified graph schema, meaning queries work the same way regardless of language. You can query across languages in the same knowledge graph when analyzing polyglot repositories. + +## Adding New Languages + +Code-Graph-RAG makes it easy to add support for any language that has a Tree-sitter grammar. See the [Adding Languages](../advanced/adding-languages.md) guide. + +!!! tip + While you can add languages yourself, we recommend waiting for official full support for optimal parsing quality and comprehensive feature coverage. [Submit a language request](https://github.com/vitali87/code-graph-rag/issues) if you need a specific language supported. diff --git a/docs/architecture/overview.md b/docs/architecture/overview.md new file mode 100644 index 000000000..5181f9d87 --- /dev/null +++ b/docs/architecture/overview.md @@ -0,0 +1,51 @@ +--- +description: "Architecture overview of Code-Graph-RAG's two-component system for codebase analysis." +--- + +# Architecture Overview + +Code-Graph-RAG consists of two main components that work together to analyze and query codebases. 
+ +## Components + +### 1. Multi-Language Parser + +A Tree-sitter based parsing system that analyzes codebases and ingests data into Memgraph. + +- Uses Tree-sitter for robust, language-agnostic AST parsing +- Extracts functions, classes, methods, modules, and their relationships +- Supports 11 programming languages with a unified graph schema +- Handles complex patterns like nested functions, class hierarchies, and cross-module calls + +### 2. RAG System (`codebase_rag/`) + +An interactive CLI for querying the stored knowledge graph. + +- Translates natural language questions into Cypher queries +- Retrieves source code snippets for found elements +- Supports AI-powered code editing with AST-based targeting +- Provides code optimization with interactive approval workflow + +## Data Flow + +``` +Source Code → Tree-sitter Parser → AST Analysis → Memgraph Knowledge Graph + ↓ +User Query → AI Model (Cypher Gen) → Cypher Query → Graph Results → Response +``` + +## Key Dependencies + +| Dependency | Purpose | +|-----------|---------| +| `tree-sitter` | Language-agnostic AST parsing | +| `pymgclient` | Memgraph database adapter | +| `pydantic-ai` | Agent framework for LLM integration | +| `pydantic-settings` | Settings management | +| `mcp` | Model Context Protocol SDK | +| `typer` | CLI framework | +| `rich` | Terminal rendering | +| `prompt-toolkit` | Interactive command line | +| `diff-match-patch` | Code patching | +| `watchdog` | Filesystem events monitoring | +| `huggingface-hub` | UniXcoder model download | diff --git a/docs/assets/demo.gif b/docs/assets/demo.gif new file mode 100644 index 000000000..0260a2f83 Binary files /dev/null and b/docs/assets/demo.gif differ diff --git a/docs/assets/favicon.png b/docs/assets/favicon.png new file mode 100644 index 000000000..7ea975f2d Binary files /dev/null and b/docs/assets/favicon.png differ diff --git a/docs/assets/logo-dark-any.png b/docs/assets/logo-dark-any.png new file mode 100644 index 000000000..56508a2d7 Binary files /dev/null and b/docs/assets/logo-dark-any.png differ diff --git a/docs/assets/logo-icon.png b/docs/assets/logo-icon.png new file mode 100644 index 000000000..5449b7e03 Binary files /dev/null and b/docs/assets/logo-icon.png differ diff --git a/docs/assets/logo-light-any.png b/docs/assets/logo-light-any.png new file mode 100644 index 000000000..89be19120 Binary files /dev/null and b/docs/assets/logo-light-any.png differ diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 000000000..2c9b04f2b --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,105 @@ +--- +description: "Contribution guidelines for Code-Graph-RAG including setup, code standards, and PR process." +--- + +# Contributing + +Thank you for your interest in contributing to Code-Graph-RAG! + +## Getting Started + +1. **Browse Issues**: Check out the [GitHub Issues](https://github.com/vitali87/code-graph-rag/issues) to find tasks that need work. Look for `good first issue` and `help wanted` labels. +2. **Pick an Issue**: Choose an issue that interests you and matches your skill level +3. **Comment on the Issue**: Let us know you're working on it to avoid duplicate effort +4. **Fork the Repository**: Create your own fork to work on +5. **Create a Branch**: Use a descriptive branch name like `feat/add-feature` or `fix/bug-description` + +## Development Setup + +```bash +git clone https://github.com/YOUR-USERNAME/code-graph-rag.git +cd code-graph-rag +make dev +``` + +This installs all dependencies and sets up pre-commit hooks automatically. 
+ +## Pre-commit Hooks + +All commits must pass pre-commit checks. Do not skip hooks with `--no-verify`. + +```bash +pre-commit install +pre-commit autoupdate +``` + +## Running Checks Locally + +```bash +make lint # Lint check +make format # Format check +make typecheck # Type check +make test-parallel # Unit tests in parallel +make test-integration # Integration tests (requires Docker) +``` + +Or run everything at once: + +```bash +make check # Runs lint + typecheck + test +make pre-commit # Runs ALL pre-commit checks (mirrors CI) +``` + +## Pull Request Guidelines + +- Keep PRs focused on a single issue or feature +- Write clear, descriptive commit messages using Conventional Commits format +- Include tests for new functionality +- Update documentation when necessary +- Be responsive to feedback during code review + +### CI Pipeline + +All pull requests are validated by CI, which runs in parallel: + +1. **Lint & Format**: `ruff check` and `ruff format --check` +2. **Type Check**: `ty check` on production code +3. **Unit Tests**: Parallel execution with `pytest-xdist` and coverage reporting +4. **Integration Tests**: Full stack testing with Memgraph +5. **PR Title Validation**: Conventional Commits format check + +### Automated Code Review + +This project uses automated code review bots (**Greptile** and **Gemini Code Assist**). Before requesting a human review, address all bot comments by either implementing suggestions or replying with a clear justification for why a suggestion doesn't apply. + +## Technical Requirements + +- **PydanticAI Only**: Do not introduce other agentic frameworks (LangChain, CrewAI, AutoGen, etc.) +- **Heavy Pydantic Usage**: Use Pydantic models for data validation, serialization, and configuration +- **Package Management**: Use `uv` for all dependency management +- **Code Quality**: Use `ruff` for linting and formatting +- **Type Safety**: Use type hints everywhere and run `uv run ty check` + +## Development Tools + +| Tool | Purpose | +|------|---------| +| `uv` | Package manager and dependency resolver | +| `ruff` | Code linting and formatting | +| `ty` | Static type checking (from Astral) | +| `pytest` | Testing framework | +| `ripgrep` (`rg`) | Shell command text searching | + +## Comment Policy + +No inline comments are allowed unless they: + +1. Appear before any code at the top of the file +2. Contain the `(H)` marker (intentional, human-written comment) +3. Are type annotations (`type:`, `noqa`, `pyright`, `ty:`) + +## Questions? + +- Open a discussion on GitHub +- Comment on the relevant issue +- Reach out to the maintainers diff --git a/docs/getting-started/configuration.md b/docs/getting-started/configuration.md new file mode 100644 index 000000000..b77b72d02 --- /dev/null +++ b/docs/getting-started/configuration.md @@ -0,0 +1,127 @@ +--- +description: "Configure Code-Graph-RAG with provider settings, environment variables, and model options." +--- + +# Configuration + +Configuration is managed through environment variables in the `.env` file. The provider-explicit configuration supports mixing different providers for orchestrator and cypher models. 
+ +## Provider Examples + +### All Ollama (Local Models) + +```bash +ORCHESTRATOR_PROVIDER=ollama +ORCHESTRATOR_MODEL=llama3.2 +ORCHESTRATOR_ENDPOINT=http://localhost:11434/v1 + +CYPHER_PROVIDER=ollama +CYPHER_MODEL=codellama +CYPHER_ENDPOINT=http://localhost:11434/v1 +``` + +### All OpenAI Models + +```bash +ORCHESTRATOR_PROVIDER=openai +ORCHESTRATOR_MODEL=gpt-4o +ORCHESTRATOR_API_KEY=sk-your-openai-key + +CYPHER_PROVIDER=openai +CYPHER_MODEL=gpt-4o-mini +CYPHER_API_KEY=sk-your-openai-key +``` + +### All Google Models + +```bash +ORCHESTRATOR_PROVIDER=google +ORCHESTRATOR_MODEL=gemini-2.5-pro +ORCHESTRATOR_API_KEY=your-google-api-key + +CYPHER_PROVIDER=google +CYPHER_MODEL=gemini-2.5-flash +CYPHER_API_KEY=your-google-api-key +``` + +Get your Google API key from [Google AI Studio](https://aistudio.google.com/app/apikey). + +### Mixed Providers + +```bash +ORCHESTRATOR_PROVIDER=google +ORCHESTRATOR_MODEL=gemini-2.5-pro +ORCHESTRATOR_API_KEY=your-google-api-key + +CYPHER_PROVIDER=ollama +CYPHER_MODEL=codellama +CYPHER_ENDPOINT=http://localhost:11434/v1 +``` + +## Orchestrator Model Settings + +| Variable | Description | +|----------|-------------| +| `ORCHESTRATOR_PROVIDER` | Provider name (`google`, `openai`, `ollama`) | +| `ORCHESTRATOR_MODEL` | Model ID (e.g., `gemini-2.5-pro`, `gpt-4o`, `llama3.2`) | +| `ORCHESTRATOR_API_KEY` | API key for the provider (if required) | +| `ORCHESTRATOR_ENDPOINT` | Custom endpoint URL (if required) | +| `ORCHESTRATOR_PROJECT_ID` | Google Cloud project ID (for Vertex AI) | +| `ORCHESTRATOR_REGION` | Google Cloud region (default: `us-central1`) | +| `ORCHESTRATOR_PROVIDER_TYPE` | Google provider type (`gla` or `vertex`) | +| `ORCHESTRATOR_THINKING_BUDGET` | Thinking budget for reasoning models | +| `ORCHESTRATOR_SERVICE_ACCOUNT_FILE` | Path to service account file (for Vertex AI) | + +## Cypher Model Settings + +| Variable | Description | +|----------|-------------| +| `CYPHER_PROVIDER` | Provider name (`google`, `openai`, `ollama`) | +| `CYPHER_MODEL` | Model ID (e.g., `gemini-2.5-flash`, `gpt-4o-mini`, `codellama`) | +| `CYPHER_API_KEY` | API key for the provider (if required) | +| `CYPHER_ENDPOINT` | Custom endpoint URL (if required) | +| `CYPHER_PROJECT_ID` | Google Cloud project ID (for Vertex AI) | +| `CYPHER_REGION` | Google Cloud region (default: `us-central1`) | +| `CYPHER_PROVIDER_TYPE` | Google provider type (`gla` or `vertex`) | +| `CYPHER_THINKING_BUDGET` | Thinking budget for reasoning models | +| `CYPHER_SERVICE_ACCOUNT_FILE` | Path to service account file (for Vertex AI) | + +## System Settings + +| Variable | Default | Description | +|----------|---------|-------------| +| `MEMGRAPH_HOST` | `localhost` | Memgraph hostname | +| `MEMGRAPH_PORT` | `7687` | Memgraph port | +| `MEMGRAPH_HTTP_PORT` | `7444` | Memgraph HTTP port | +| `LAB_PORT` | `3000` | Memgraph Lab port | +| `MEMGRAPH_BATCH_SIZE` | `1000` | Batch size for Memgraph operations | +| `TARGET_REPO_PATH` | `.` | Default repository path | +| `LOCAL_MODEL_ENDPOINT` | `http://localhost:11434/v1` | Fallback endpoint for Ollama | + +## Setting Up Ollama + +```bash +curl -fsSL https://ollama.ai/install.sh | sh + +ollama pull llama3.2 +# Or try other models: +# ollama pull llama3 +# ollama pull mistral +# ollama pull codellama +``` + +Ollama automatically starts serving on `localhost:11434`. + +!!! note + Local models provide privacy and no API costs, but may have lower accuracy compared to cloud models like Gemini or GPT-4o. 
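+
+To confirm Ollama is reachable before pointing the agent at it, you can query its model-listing endpoint. A minimal sketch (`/api/tags` is Ollama's native model-listing route; the `/v1` suffix used above is only the OpenAI-compatible layer):
+
+```python
+import json
+import urllib.request
+
+# Ollama's native HTTP API is served at the root endpoint.
+ENDPOINT = "http://localhost:11434"
+
+with urllib.request.urlopen(f"{ENDPOINT}/api/tags") as resp:
+    models = json.load(resp)["models"]
+
+# Prints every model you have pulled locally, e.g. llama3.2, codellama.
+for model in models:
+    print(model["name"])
+```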
+
+## Programmatic Configuration
+
+You can also configure providers programmatically via the Python SDK:
+
+```python
+from cgr import settings
+
+settings.set_orchestrator("openai", "gpt-4o", api_key="sk-...")
+settings.set_cypher("google", "gemini-2.5-flash", api_key="your-key")
+```
diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md
new file mode 100644
index 000000000..522d380b8
--- /dev/null
+++ b/docs/getting-started/installation.md
@@ -0,0 +1,115 @@
+---
+description: "Install Code-Graph-RAG and set up Memgraph for multi-language codebase analysis."
+---
+
+# Installation
+
+## Prerequisites
+
+- Python 3.12+
+- Docker & Docker Compose (for Memgraph)
+- **cmake** (required for building pymgclient dependency)
+- **ripgrep** (`rg`) (required for shell command text searching)
+- **For cloud models**: Google Gemini API key, OpenAI API key, or both
+- **For local models**: Ollama installed and running
+- `uv` package manager (recommended) or `pip`
+
+### Installing cmake and ripgrep
+
+=== "macOS"
+
+    ```bash
+    brew install cmake ripgrep
+    ```
+
+=== "Ubuntu/Debian"
+
+    ```bash
+    sudo apt-get update
+    sudo apt-get install cmake ripgrep
+    ```
+
+=== "CentOS/RHEL"
+
+    ```bash
+    sudo dnf install cmake ripgrep
+    ```
+
+    ripgrep may need to be installed from EPEL or via `cargo install ripgrep`.
+
+## Install from PyPI
+
+```bash
+pip install code-graph-rag
+```
+
+With all Tree-sitter grammars (Python, JS, TS, Rust, Go, Java, Scala, C++, Lua):
+
+```bash
+pip install 'code-graph-rag[treesitter-full]'
+```
+
+With semantic code search (UniXcoder embeddings):
+
+```bash
+pip install 'code-graph-rag[semantic]'
+```
+
+With both full language support and semantic search:
+
+```bash
+pip install 'code-graph-rag[treesitter-full,semantic]'
+```
+
+## Install from Source
+
+```bash
+git clone https://github.com/vitali87/code-graph-rag.git
+cd code-graph-rag
+```
+
+For basic Python support:
+
+```bash
+uv sync
+```
+
+For full multi-language support:
+
+```bash
+uv sync --extra treesitter-full
+```
+
+For development (including tests and pre-commit hooks):
+
+```bash
+make dev
+```
+
+This installs all dependencies and sets up pre-commit hooks automatically.
+
+## Start Memgraph
+
+```bash
+docker compose up -d
+```
+
+This starts the Memgraph database on port 7687 and Memgraph Lab on port 3000.
+
+## Set Up Environment Variables
+
+```bash
+cp .env.example .env
+# Edit .env with your configuration
+```
+
+See the [Configuration](configuration.md) guide for all available options.
+
+## Verify Your Setup
+
+```bash
+cgr doctor
+```
+
+This checks that all required dependencies and services are available.
diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md
new file mode 100644
index 000000000..97100cc9b
--- /dev/null
+++ b/docs/getting-started/quickstart.md
@@ -0,0 +1,103 @@
+---
+description: "Parse, query, and export your codebase in 5 minutes with Code-Graph-RAG."
+---
+
+# Quick Start
+
+Get from zero to querying your codebase in 5 minutes.
+
+## Step 1: Parse a Repository
+
+Parse and ingest a multi-language repository into the knowledge graph.
+ +**For the first repository (clean start):** + +```bash +cgr start --repo-path /path/to/repo1 --update-graph --clean +``` + +**For additional repositories (preserve existing data):** + +```bash +cgr start --repo-path /path/to/repo2 --update-graph +cgr start --repo-path /path/to/repo3 --update-graph +``` + +**Control Memgraph batch flushing:** + +```bash +cgr start --repo-path /path/to/repo --update-graph --batch-size 5000 +``` + +The system automatically detects and processes files for all supported languages. + +## Step 2: Query the Codebase + +Start the interactive RAG CLI: + +```bash +cgr start --repo-path /path/to/your/repo +``` + +**Specify custom models:** + +```bash +cgr start --repo-path /path/to/your/repo \ + --orchestrator ollama:llama3.2 \ + --cypher ollama:codellama +``` + +```bash +cgr start --repo-path /path/to/your/repo \ + --orchestrator google:gemini-2.0-flash-thinking-exp-01-21 \ + --cypher google:gemini-2.5-flash-lite-preview-06-17 +``` + +**Example queries:** + +- "Show me all classes that contain 'user' in their name" +- "Find functions related to database operations" +- "What methods does the User class have?" +- "Show me functions that handle authentication" +- "List all TypeScript components" +- "Find Rust structs and their methods" +- "Add logging to all database connection functions" +- "Refactor the User class to use dependency injection" + +## Step 3: Export Graph Data + +**Export during graph update:** + +```bash +cgr start --repo-path /path/to/repo --update-graph --clean -o my_graph.json +``` + +**Export existing graph without updating:** + +```bash +cgr export -o my_graph.json +``` + +**Work with exported data in Python:** + +```python +from codebase_rag.graph_loader import load_graph + +graph = load_graph("my_graph.json") +summary = graph.summary() +print(f"Total nodes: {summary['total_nodes']}") +print(f"Total relationships: {summary['total_relationships']}") + +functions = graph.find_nodes_by_label("Function") +for func in functions[:5]: + relationships = graph.get_relationships_for_node(func.node_id) + print(f"Function {func.properties['name']} has {len(relationships)} relationships") +``` + +## What Next? + +- [CLI Reference](../guide/cli-reference.md) for all available commands +- [Interactive Querying](../guide/interactive-querying.md) for query examples +- [Code Optimization](../guide/code-optimization.md) for AI-powered improvements +- [MCP Server](../guide/mcp-server.md) for Claude Code integration +- [Python SDK](../sdk/overview.md) for programmatic access diff --git a/docs/guide/cli-reference.md b/docs/guide/cli-reference.md new file mode 100644 index 000000000..6c5842703 --- /dev/null +++ b/docs/guide/cli-reference.md @@ -0,0 +1,111 @@ +--- +description: "Complete CLI reference for Code-Graph-RAG commands and Makefile targets." +--- + +# CLI Reference + +The `cgr` command is the main entry point for Code-Graph-RAG. + +## Core Commands + +### `cgr start` + +Parse a repository and/or start the interactive query CLI. 
+
+```bash
+cgr start --repo-path /path/to/repo [OPTIONS]
+```
+
+| Option | Description |
+|--------|-------------|
+| `--repo-path` | Path to repository (defaults to current directory) |
+| `--update-graph` | Parse and ingest the repository into the knowledge graph |
+| `--clean` | Clear existing data before ingesting |
+| `--batch-size` | Override Memgraph flush batch size |
+| `--orchestrator` | Specify provider:model for main operations (e.g., `google:gemini-2.5-pro`, `ollama:llama3.2`) |
+| `--cypher` | Specify provider:model for graph queries (e.g., `google:gemini-2.5-flash`, `ollama:codellama`) |
+| `-o` | Export graph to JSON file during update |
+
+### `cgr export`
+
+Export the knowledge graph to JSON.
+
+```bash
+cgr export -o my_graph.json
+```
+
+### `cgr optimize`
+
+AI-powered codebase optimization.
+
+```bash
+cgr optimize <language> --repo-path /path/to/repo [OPTIONS]
+```
+
+| Option | Description |
+|--------|-------------|
+| `--repo-path` | Path to repository |
+| `--orchestrator` | Specify provider:model for operations |
+| `--batch-size` | Override Memgraph flush batch size |
+| `--reference-document` | Path to reference documentation for guided optimization |
+
+Supported languages: `python`, `javascript`, `typescript`, `rust`, `go`, `java`, `scala`, `cpp`
+
+### `cgr mcp-server`
+
+Start the MCP server for Claude Code integration.
+
+```bash
+cgr mcp-server
+```
+
+### `cgr index`
+
+Index a repository to protobuf for offline use.
+
+```bash
+cgr index -o ./index-output --repo-path ./my-project
+```
+
+### `cgr doctor`
+
+Check that all required dependencies and services are available.
+
+```bash
+cgr doctor
+```
+
+### `cgr language`
+
+Manage language support.
+
+```bash
+cgr language add-grammar
+cgr language add-grammar --grammar-url
+cgr language list-languages
+cgr language remove-language
+```
+
+## Makefile Commands
+
+| Command | Description |
+|---------|-------------|
+| `make help` | Show help message |
+| `make all` | Install everything for full development environment |
+| `make install` | Install project dependencies with full language support |
+| `make python` | Install project dependencies for Python only |
+| `make dev` | Setup development environment (install deps + pre-commit hooks) |
+| `make test` | Run unit tests only (fast, no Docker) |
+| `make test-parallel` | Run unit tests in parallel (fast, no Docker) |
+| `make test-integration` | Run integration tests (requires Docker) |
+| `make test-all` | Run all tests including integration and e2e (requires Docker) |
+| `make test-parallel-all` | Run all tests in parallel (requires Docker) |
+| `make clean` | Clean up build artifacts and cache |
+| `make build-grammars` | Build grammar submodules |
+| `make watch` | Watch repository for changes and update graph in real-time |
+| `make readme` | Regenerate README.md from codebase |
+| `make lint` | Run ruff check |
+| `make format` | Run ruff format |
+| `make typecheck` | Run type checking with ty |
+| `make check` | Run all checks: lint, typecheck, test |
+| `make pre-commit` | Run all pre-commit checks locally |
diff --git a/docs/guide/code-optimization.md b/docs/guide/code-optimization.md
new file mode 100644
index 000000000..77b7e6698
--- /dev/null
+++ b/docs/guide/code-optimization.md
@@ -0,0 +1,91 @@
+---
+description: "AI-powered codebase optimization with language-specific best practices and interactive approval."
+--- + +# Code Optimization + +Code-Graph-RAG provides AI-powered codebase optimization with best practices guidance and an interactive approval workflow. + +## Basic Usage + +```bash +cgr optimize python --repo-path /path/to/your/repo +``` + +## With Reference Documentation + +Guide the optimization process using your own coding standards: + +```bash +cgr optimize python \ + --repo-path /path/to/your/repo \ + --reference-document /path/to/best_practices.md +``` + +```bash +cgr optimize java \ + --reference-document ./ARCHITECTURE.md +``` + +```bash +cgr optimize rust \ + --reference-document ./docs/performance_guide.md +``` + +The agent incorporates guidance from your reference documents when suggesting optimizations, ensuring they align with your project's standards and architectural decisions. + +## Using Specific Models + +```bash +cgr optimize javascript \ + --repo-path /path/to/frontend \ + --orchestrator google:gemini-2.0-flash-thinking-exp-01-21 +``` + +```bash +cgr optimize javascript --repo-path /path/to/frontend \ + --batch-size 5000 +``` + +## Supported Languages + +All supported languages: `python`, `javascript`, `typescript`, `rust`, `go`, `java`, `scala`, `cpp` + +## How It Works + +1. **Analysis Phase**: The agent analyzes your codebase structure using the knowledge graph +2. **Pattern Recognition**: Identifies common anti-patterns, performance issues, and improvement opportunities +3. **Best Practices Application**: Applies language-specific best practices and patterns +4. **Interactive Approval**: Presents each optimization suggestion for your approval before implementation +5. **Guided Implementation**: Implements approved changes with detailed explanations + +## Example Session + +``` +Starting python optimization session... +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ The agent will analyze your python codebase and propose specific ┃ +┃ optimizations. You'll be asked to approve each suggestion before ┃ +┃ implementation. Type 'exit' or 'quit' to end the session. ┃ +┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ + +Analyzing codebase structure... +Found 23 Python modules with potential optimizations + +Optimization Suggestion #1: + File: src/data_processor.py + Issue: Using list comprehension in a loop can be optimized + Suggestion: Replace with generator expression for memory efficiency + + [y/n] Do you approve this optimization? +``` + +## CLI Options + +| Option | Description | +|--------|-------------| +| `--orchestrator` | Specify provider:model for main operations | +| `--cypher` | Specify provider:model for graph queries | +| `--repo-path` | Path to repository (defaults to current directory) | +| `--batch-size` | Override Memgraph flush batch size | +| `--reference-document` | Path to reference documentation | diff --git a/docs/guide/graph-export.md b/docs/guide/graph-export.md new file mode 100644 index 000000000..814321dd0 --- /dev/null +++ b/docs/guide/graph-export.md @@ -0,0 +1,63 @@ +--- +description: "Export the Code-Graph-RAG knowledge graph to JSON for programmatic analysis and integration." +--- + +# Graph Export + +Export the entire knowledge graph to JSON for programmatic access and integration with other tools. 
+ +## Export Commands + +**Export during graph update:** + +```bash +cgr start --repo-path /path/to/repo --update-graph --clean -o my_graph.json +``` + +**Export existing graph without updating:** + +```bash +cgr export -o my_graph.json +``` + +**Adjust Memgraph batching during export:** + +```bash +cgr export -o my_graph.json --batch-size 5000 +``` + +## Working with Exported Data + +```python +from codebase_rag.graph_loader import load_graph + +graph = load_graph("my_graph.json") + +summary = graph.summary() +print(f"Total nodes: {summary['total_nodes']}") +print(f"Total relationships: {summary['total_relationships']}") + +functions = graph.find_nodes_by_label("Function") +classes = graph.find_nodes_by_label("Class") + +for func in functions[:5]: + relationships = graph.get_relationships_for_node(func.node_id) + print(f"Function {func.properties['name']} has {len(relationships)} relationships") +``` + +## Example Analysis Script + +```bash +python examples/graph_export_example.py my_graph.json +``` + +## Use Cases + +Exported graph data is useful for: + +- Integration with other tools +- Custom analysis scripts +- Building documentation generators +- Creating code metrics dashboards + +See the [Python SDK](../sdk/overview.md) for more programmatic access patterns. diff --git a/docs/guide/interactive-querying.md b/docs/guide/interactive-querying.md new file mode 100644 index 000000000..5f3dd983b --- /dev/null +++ b/docs/guide/interactive-querying.md @@ -0,0 +1,89 @@ +--- +description: "Query your codebase with natural language using Code-Graph-RAG's interactive CLI." +--- + +# Interactive Querying + +Code-Graph-RAG lets you ask questions about your codebase in plain English. The system translates your questions into Cypher queries, executes them against the knowledge graph, and returns relevant results with source code snippets. + +## Starting the CLI + +```bash +cgr start --repo-path /path/to/your/repo +``` + +## Example Queries + +### Finding Code Elements + +- "Show me all classes that contain 'user' in their name" +- "Find functions related to database operations" +- "What methods does the User class have?" +- "Show me functions that handle authentication" +- "List all TypeScript components" +- "Find Rust structs and their methods" +- "Show me Go interfaces and implementations" + +### Analyzing Relationships + +- "Find all functions that call each other" +- "What classes are in the user module" +- "Show me functions with the longest call chains" +- "What functions call UserService.create_user?" 
+- "Show me all classes that implement the Repository interface" + +### C++ Specific Queries + +- "Find all C++ operator overloads in the Matrix class" +- "Show me C++ template functions with their specializations" +- "List all C++ namespaces and their contained classes" +- "Find C++ lambda expressions used in algorithms" + +### Code Editing Queries + +- "Add logging to all database connection functions" +- "Refactor the User class to use dependency injection" +- "Convert these Python functions to async/await pattern" +- "Add error handling to authentication methods" +- "Optimize this function for better performance" + +## Semantic Code Search + +Search for functions by describing what they do, rather than by exact names: + +- "error handling functions" +- "authentication code" +- "database connection setup" + +Semantic search uses UniXcoder embeddings and requires the `semantic` extra: + +```bash +pip install 'code-graph-rag[semantic]' +``` + +## Agentic Tools + +The interactive agent has access to these tools: + +| Tool | Description | +|------|-------------| +| `query_graph` | Query the knowledge graph using natural language | +| `read_file` | Read the content of text-based files | +| `create_file` | Create a new file with content | +| `replace_code` | Surgically replace specific code blocks | +| `list_directory` | List directory contents | +| `analyze_document` | Analyze documents (PDFs, images) | +| `execute_shell` | Execute shell commands from allowlist | +| `semantic_search` | Semantic function search by description | +| `get_function_source` | Retrieve source code by node ID | +| `get_code_snippet` | Retrieve source code by qualified name | + +## Intelligent File Editing + +The agent uses AST-based function targeting with Tree-sitter for precise code modifications: + +- **Visual diff preview** before changes +- **Surgical patching** that only modifies target code blocks +- **Multi-language support** across all supported languages +- **Security sandbox** preventing edits outside project directory +- **Smart function matching** with qualified names and line numbers diff --git a/docs/guide/mcp-server.md b/docs/guide/mcp-server.md new file mode 100644 index 000000000..96be4598a --- /dev/null +++ b/docs/guide/mcp-server.md @@ -0,0 +1,140 @@ +--- +description: "Integrate Code-Graph-RAG with Claude Code as an MCP server for natural language codebase analysis." +--- + +# MCP Server (Claude Code Integration) + +Code-Graph-RAG can run as an MCP (Model Context Protocol) server, enabling seamless integration with Claude Code and other MCP clients. 
+ +## Quick Setup + +**If installed via pip** (and `code-graph-rag` is on your PATH): + +```bash +claude mcp add --transport stdio code-graph-rag \ + --env TARGET_REPO_PATH=/absolute/path/to/your/project \ + --env CYPHER_PROVIDER=openai \ + --env CYPHER_MODEL=gpt-4 \ + --env CYPHER_API_KEY=your-api-key \ + -- code-graph-rag mcp-server +``` + +**If installed from source:** + +```bash +claude mcp add --transport stdio code-graph-rag \ + --env TARGET_REPO_PATH=/absolute/path/to/your/project \ + --env CYPHER_PROVIDER=openai \ + --env CYPHER_MODEL=gpt-4 \ + --env CYPHER_API_KEY=your-api-key \ + -- uv run --directory /path/to/code-graph-rag code-graph-rag mcp-server +``` + +### Using Current Directory + +```bash +cd /path/to/your/project + +claude mcp add --transport stdio code-graph-rag \ + --env TARGET_REPO_PATH="$(pwd)" \ + --env CYPHER_PROVIDER=google \ + --env CYPHER_MODEL=gemini-2.0-flash \ + --env CYPHER_API_KEY=your-google-api-key \ + -- uv run --directory /absolute/path/to/code-graph-rag code-graph-rag mcp-server +``` + +## Prerequisites + +```bash +git clone https://github.com/vitali87/code-graph-rag.git +cd code-graph-rag +uv sync + +docker run -p 7687:7687 -p 7444:7444 memgraph/memgraph-platform +``` + +## Available Tools + +| Tool | Description | +|------|-------------| +| `list_projects` | List all indexed projects in the knowledge graph database | +| `delete_project` | Delete a specific project from the knowledge graph database | +| `wipe_database` | Completely wipe the entire database (cannot be undone) | +| `index_repository` | Parse and ingest the repository into the knowledge graph | +| `query_code_graph` | Query the codebase knowledge graph using natural language | +| `get_code_snippet` | Retrieve source code for a function, class, or method by qualified name | +| `surgical_replace_code` | Surgically replace an exact code block using diff-match-patch | +| `read_file` | Read file contents with pagination support | +| `write_file` | Write content to a file | +| `list_directory` | List directory contents | + +## Example Usage + +``` +> Index this repository +> What functions call UserService.create_user? +> Update the login function to add rate limiting +``` + +## LLM Provider Options + +=== "OpenAI" + + ```bash + --env CYPHER_PROVIDER=openai \ + --env CYPHER_MODEL=gpt-4 \ + --env CYPHER_API_KEY=sk-... + ``` + +=== "Google Gemini" + + ```bash + --env CYPHER_PROVIDER=google \ + --env CYPHER_MODEL=gemini-2.5-flash \ + --env CYPHER_API_KEY=... + ``` + +=== "Ollama (free, local)" + + ```bash + --env CYPHER_PROVIDER=ollama \ + --env CYPHER_MODEL=llama3.2 + ``` + +## Multi-Repository Setup + +Add separate named instances for different projects: + +```bash +claude mcp add --transport stdio code-graph-rag-backend \ + --env TARGET_REPO_PATH=/path/to/backend \ + --env CYPHER_PROVIDER=openai \ + --env CYPHER_MODEL=gpt-4 \ + --env CYPHER_API_KEY=your-api-key \ + -- uv run --directory /path/to/code-graph-rag code-graph-rag mcp-server + +claude mcp add --transport stdio code-graph-rag-frontend \ + --env TARGET_REPO_PATH=/path/to/frontend \ + --env CYPHER_PROVIDER=openai \ + --env CYPHER_MODEL=gpt-4 \ + --env CYPHER_API_KEY=your-api-key \ + -- uv run --directory /path/to/code-graph-rag code-graph-rag mcp-server +``` + +!!! warning + Only one repository can be indexed at a time per MCP instance. When you index a new repository, the previous repository's data is automatically cleared. 
+ +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| Can't find uv/code-graph-rag | Use absolute paths from `which uv` | +| Wrong repository analyzed | Set `TARGET_REPO_PATH` to an absolute path | +| Memgraph connection failed | Ensure `docker ps` shows Memgraph running | +| Tools not showing | Run `claude mcp list` to verify installation | + +## Remove + +```bash +claude mcp remove code-graph-rag +``` diff --git a/docs/guide/realtime-updates.md b/docs/guide/realtime-updates.md new file mode 100644 index 000000000..9516eea31 --- /dev/null +++ b/docs/guide/realtime-updates.md @@ -0,0 +1,62 @@ +--- +description: "Keep your Code-Graph-RAG knowledge graph synchronized with code changes using the real-time file watcher." +--- + +# Real-Time Graph Updates + +For active development, keep your knowledge graph automatically synchronized with code changes using the real-time updater. + +## What It Does + +- Watches your repository for file changes (create, modify, delete) +- Automatically updates the knowledge graph in real-time +- Maintains consistency by recalculating all function call relationships +- Filters out irrelevant files (`.git`, `node_modules`, etc.) + +## Usage + +Run the real-time updater in a separate terminal: + +```bash +python realtime_updater.py /path/to/your/repo +``` + +Or using the Makefile: + +```bash +make watch REPO_PATH=/path/to/your/repo +``` + +### With Custom Memgraph Settings + +```bash +python realtime_updater.py /path/to/your/repo \ + --host localhost --port 7687 --batch-size 1000 +``` + +```bash +make watch REPO_PATH=/path/to/your/repo HOST=localhost PORT=7687 BATCH_SIZE=1000 +``` + +## Multi-Terminal Workflow + +```bash +# Terminal 1: Start the real-time updater +python realtime_updater.py ~/my-project + +# Terminal 2: Run the AI assistant +cgr start --repo-path ~/my-project +``` + +## CLI Arguments + +| Argument | Required | Default | Description | +|----------|----------|---------|-------------| +| `repo_path` | Yes | | Path to repository to watch | +| `--host` | No | `localhost` | Memgraph host | +| `--port` | No | `7687` | Memgraph port | +| `--batch-size` | No | | Number of buffered nodes/relationships before flushing to Memgraph | + +## Performance Note + +The updater currently recalculates all CALLS relationships on every file change to ensure consistency. This prevents "island" problems where changes in one file aren't reflected in relationships from other files, but may impact performance on very large codebases with frequent changes. Optimization of this behavior is a work in progress. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..c62861c38 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,48 @@ +--- +description: "Graph-based RAG system that parses multi-language codebases with Tree-sitter, builds knowledge graphs, and enables natural language querying, editing, and optimization." +--- + +# Code-Graph-RAG + +**The ultimate RAG for your monorepo.** Query, understand, and edit multi-language codebases with the power of AI and knowledge graphs. + +

+<div align="center">
+  <img src="assets/demo.gif" alt="Code-Graph-RAG Demo">
+</div>

+
+## What is Code-Graph-RAG?
+
+Code-Graph-RAG is an accurate Retrieval-Augmented Generation (RAG) system that analyzes multi-language codebases using Tree-sitter, builds comprehensive knowledge graphs in Memgraph, and enables natural language querying and editing of codebase structure and relationships.
+
+## Key Features
+
+- **Multi-Language Support** for Python, TypeScript, JavaScript, Rust, Java, C++, Go, Lua, and more
+- **Tree-sitter Parsing** for robust, language-agnostic AST analysis
+- **Knowledge Graph Storage** using Memgraph for interconnected codebase structure
+- **Natural Language Querying** to ask questions about your code in plain English
+- **AI-Powered Cypher Generation** with Google Gemini, OpenAI, and Ollama support
+- **Code Snippet Retrieval** with actual source code for found functions and methods
+- **Advanced File Editing** with AST-based function targeting and visual diff previews
+- **Shell Command Execution** for running tests and CLI tools
+- **Interactive Code Optimization** with language-specific best practices
+- **Reference-Guided Optimization** using your own coding standards
+- **Dependency Analysis** from `pyproject.toml`
+- **Semantic Code Search** using UniXcoder embeddings to find functions by intent
+- **MCP Server Integration** for seamless use with Claude Code
+- **Real-Time Graph Updates** via file watcher for active development
+
+## Quick Start
+
+```bash
+pip install code-graph-rag
+docker compose up -d
+cgr start --repo-path ./my-project --update-graph --clean
+```
+
+See the [Installation](getting-started/installation.md) guide for full setup instructions.
+
+## Enterprise Services
+
+Code-Graph-RAG is open source and free to use. For organizations that need more, we offer **fully managed cloud-hosted solutions** and **on-premise deployments**.
+
+[View plans & pricing at code-graph-rag.com](https://code-graph-rag.com/enterprise){ .md-button }
diff --git a/docs/overrides/main.html b/docs/overrides/main.html
new file mode 100644
index 000000000..528edb714
--- /dev/null
+++ b/docs/overrides/main.html
@@ -0,0 +1,30 @@
+{% extends "base.html" %}
+
+{% block extrahead %}
+
+{% endblock %}
diff --git a/docs/sdk/cypher-generator.md b/docs/sdk/cypher-generator.md
new file mode 100644
index 000000000..b9ef63613
--- /dev/null
+++ b/docs/sdk/cypher-generator.md
@@ -0,0 +1,47 @@
+---
+description: "Generate Cypher queries from natural language using Code-Graph-RAG's CypherGenerator."
+---
+
+# Cypher Generator
+
+The `CypherGenerator` translates natural language questions into Cypher queries for the knowledge graph.
+
+## Usage
+
+```python
+import asyncio
+from cgr import CypherGenerator
+
+async def main():
+    gen = CypherGenerator()
+    cypher = await gen.generate("Find all classes that inherit from BaseModel")
+    print(cypher)
+
+asyncio.run(main())
+```
+
+## Configuration
+
+The Cypher generator uses the configured Cypher provider.
Set it via environment variables: + +```bash +CYPHER_PROVIDER=google +CYPHER_MODEL=gemini-2.5-flash +CYPHER_API_KEY=your-api-key +``` + +Or programmatically: + +```python +from cgr import settings + +settings.set_cypher("google", "gemini-2.5-flash", api_key="your-key") +``` + +## Supported Providers + +| Provider | Example Models | +|----------|---------------| +| Google | `gemini-2.5-pro`, `gemini-2.5-flash` | +| OpenAI | `gpt-4o`, `gpt-4o-mini` | +| Ollama | `codellama`, `llama3.2` | diff --git a/docs/sdk/graph-loader.md b/docs/sdk/graph-loader.md new file mode 100644 index 000000000..f14df3a90 --- /dev/null +++ b/docs/sdk/graph-loader.md @@ -0,0 +1,73 @@ +--- +description: "Load and query exported Code-Graph-RAG knowledge graphs with the Python SDK." +--- + +# Graph Loader + +The `load_graph` function loads exported JSON graph data for programmatic analysis. + +## Export a Graph + +First, export the knowledge graph to JSON: + +```bash +cgr export -o my_graph.json +``` + +Or export during graph update: + +```bash +cgr start --repo-path /path/to/repo --update-graph --clean -o my_graph.json +``` + +## Load and Query + +```python +from cgr import load_graph + +graph = load_graph("my_graph.json") +``` + +### Summary Statistics + +```python +summary = graph.summary() +print(f"Total nodes: {summary['total_nodes']}") +print(f"Total relationships: {summary['total_relationships']}") +``` + +### Find Nodes by Label + +```python +functions = graph.find_nodes_by_label("Function") +classes = graph.find_nodes_by_label("Class") +modules = graph.find_nodes_by_label("Module") +``` + +### Analyze Relationships + +```python +for func in functions[:5]: + relationships = graph.get_relationships_for_node(func.node_id) + print(f"Function {func.properties['name']} has {len(relationships)} relationships") +``` + +## Query Memgraph Directly + +For live queries against a running Memgraph instance: + +```python +from cgr import MemgraphIngestor + +with MemgraphIngestor(host="localhost", port=7687) as db: + rows = db.fetch_all("MATCH (f:Function) RETURN f.name LIMIT 10") + for row in rows: + print(row) +``` + +## Use Cases + +- Integration with other tools +- Custom analysis scripts +- Building documentation generators +- Creating code metrics dashboards diff --git a/docs/sdk/overview.md b/docs/sdk/overview.md new file mode 100644 index 000000000..8a4a88918 --- /dev/null +++ b/docs/sdk/overview.md @@ -0,0 +1,58 @@ +--- +description: "Python SDK overview for Code-Graph-RAG programmatic access." +--- + +# Python SDK Overview + +The `cgr` package provides short imports for programmatic use of Code-Graph-RAG. 
+ +## Installation + +```bash +pip install code-graph-rag +``` + +With semantic code search: + +```bash +pip install 'code-graph-rag[semantic]' +``` + +## Quick Example + +```python +from cgr import load_graph + +graph = load_graph("graph.json") +print(graph.summary()) + +functions = graph.find_nodes_by_label("Function") +for fn in functions[:5]: + rels = graph.get_relationships_for_node(fn.node_id) + print(f"{fn.properties['name']}: {len(rels)} relationships") +``` + +## Available Modules + +| Import | Purpose | +|--------|---------| +| `from cgr import load_graph` | Load and query exported graph data | +| `from cgr import MemgraphIngestor` | Query Memgraph with Cypher directly | +| `from cgr import CypherGenerator` | Generate Cypher from natural language | +| `from cgr import embed_code` | Semantic code search with UniXcoder | +| `from cgr import settings` | Configure providers programmatically | + +## Configuration + +```python +from cgr import settings + +settings.set_orchestrator("openai", "gpt-4o", api_key="sk-...") +settings.set_cypher("google", "gemini-2.5-flash", api_key="your-key") +``` + +See individual pages for detailed API usage: + +- [Graph Loader](graph-loader.md) +- [Cypher Generator](cypher-generator.md) +- [Semantic Search](semantic-search.md) diff --git a/docs/sdk/semantic-search.md b/docs/sdk/semantic-search.md new file mode 100644 index 000000000..ac4393b32 --- /dev/null +++ b/docs/sdk/semantic-search.md @@ -0,0 +1,40 @@ +--- +description: "Semantic code search with UniXcoder embeddings in Code-Graph-RAG." +--- + +# Semantic Search + +Code-Graph-RAG supports intent-based code search using UniXcoder embeddings. Find functions by describing what they do rather than by exact names. + +## Installation + +Semantic search requires the `semantic` extra: + +```bash +pip install 'code-graph-rag[semantic]' +``` + +## Usage + +### Generate Code Embeddings + +```python +from cgr import embed_code + +embedding = embed_code("def authenticate(user, password): ...") +print(f"Embedding dimension: {len(embedding)}") +``` + +### Search by Description + +In the interactive CLI, you can search semantically: + +- "error handling functions" +- "authentication code" +- "database connection setup" + +The system returns potential matches with similarity scores. + +## How It Works + +UniXcoder is a unified cross-modal pre-trained model that supports both code understanding and generation. Code-Graph-RAG uses it to create embeddings that capture the semantic meaning of code, enabling searches based on what code does rather than what it's named. 
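+
+To make the idea concrete, here is a minimal sketch of embedding-based ranking built on `embed_code` (it assumes `embed_code` returns a flat numeric vector, and the snippet names are hypothetical; the CLI's actual ranking pipeline may differ):
+
+```python
+from cgr import embed_code
+
+def cosine(a, b):
+    # Plain cosine similarity: higher means more semantically similar.
+    dot = sum(x * y for x, y in zip(a, b))
+    norm = (sum(x * x for x in a) ** 0.5) * (sum(y * y for y in b) ** 0.5)
+    return dot / norm
+
+# Hypothetical snippets standing in for functions indexed from a repo.
+snippets = {
+    "check_login": "def check_login(user, password): ...",
+    "open_conn": "def open_conn(dsn): ...",
+}
+
+query_vec = embed_code("code that verifies user credentials")
+for name, src in snippets.items():
+    print(f"{name}: {cosine(query_vec, embed_code(src)):.3f}")
+```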
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 000000000..e9e4cc5f4 --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,337 @@ +@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500;600&display=swap'); + +:root { + --cgr-bg: #030712; + --cgr-surface: #111827; + --cgr-surface-lighter: #1f2937; + --cgr-brand: #6366f1; + --cgr-brand-light: #818cf8; + --cgr-brand-dark: #4f46e5; + --cgr-gray-50: #f9fafb; + --cgr-gray-400: #99a1af; + --cgr-gray-500: #6a7282; + --cgr-gray-800: #1e2939; + --cgr-indigo-700: #432dd7; +} + +/* Dark mode */ +[data-md-color-scheme="slate"] { + --md-default-bg-color: var(--cgr-bg); + --md-default-fg-color: var(--cgr-gray-50); + --md-default-fg-color--light: var(--cgr-gray-400); + --md-default-fg-color--lighter: var(--cgr-gray-500); + --md-default-fg-color--lightest: var(--cgr-gray-800); + --md-primary-fg-color: var(--cgr-brand); + --md-primary-fg-color--light: var(--cgr-brand-light); + --md-primary-fg-color--dark: var(--cgr-brand-dark); + --md-primary-bg-color: var(--cgr-gray-50); + --md-primary-bg-color--light: var(--cgr-gray-400); + --md-accent-fg-color: var(--cgr-brand-light); + --md-accent-fg-color--transparent: rgba(129, 140, 248, 0.1); + --md-accent-bg-color: var(--cgr-brand); + --md-code-bg-color: var(--cgr-surface); + --md-code-fg-color: #e2e8f0; + --md-code-hl-color: var(--cgr-surface-lighter); + --md-code-hl-number-color: #fbbf24; + --md-code-hl-string-color: #34d399; + --md-code-hl-keyword-color: #c084fc; + --md-code-hl-function-color: #60a5fa; + --md-code-hl-comment-color: var(--cgr-gray-500); + --md-code-hl-constant-color: #f472b6; + --md-code-hl-operator-color: #fbbf24; + --md-code-hl-punctuation-color: var(--cgr-gray-400); + --md-code-hl-special-color: #fb923c; + --md-code-hl-name-color: var(--cgr-gray-50); + --md-code-hl-generic-color: var(--cgr-gray-50); + --md-code-hl-variable-color: #f9fafb; + --md-footer-bg-color: var(--cgr-bg); + --md-footer-bg-color--dark: var(--cgr-bg); + --md-footer-fg-color: var(--cgr-gray-400); + --md-footer-fg-color--light: var(--cgr-gray-500); + --md-footer-fg-color--lighter: var(--cgr-gray-500); + --md-typeset-a-color: var(--cgr-brand-light); + --md-typeset-color: var(--cgr-gray-50); + --md-typeset-table-color: rgba(99, 102, 241, 0.05); + --md-typeset-table-color--light: rgba(99, 102, 241, 0.02); + --md-admonition-bg-color: var(--cgr-surface); + --md-shadow-z1: 0 0 0 transparent; + --md-shadow-z2: 0 0 0 transparent; + --md-shadow-z3: 0 0 0 transparent; +} + +[data-md-color-scheme="slate"] .md-header, +[data-md-color-scheme="slate"] .md-tabs { + background-color: var(--cgr-surface); + border-bottom: 1px solid var(--cgr-gray-800); +} + +[data-md-color-scheme="slate"] .md-tabs__link { + color: var(--cgr-gray-400); + opacity: 1; + transition: color 0.2s ease; +} + +[data-md-color-scheme="slate"] .md-tabs__link:hover { + color: var(--cgr-gray-50); +} + +[data-md-color-scheme="slate"] .md-tabs__link--active { + color: var(--cgr-brand-light); +} + +[data-md-color-scheme="slate"] .md-nav--primary .md-nav__item--active > .md-nav__link { + color: var(--cgr-brand-light); +} + +[data-md-color-scheme="slate"] .md-sidebar { + background-color: var(--cgr-bg); +} + +[data-md-color-scheme="slate"] .md-nav__link { + color: var(--cgr-gray-400); + transition: color 0.2s ease; +} + +[data-md-color-scheme="slate"] .md-nav__link:hover { + color: var(--cgr-gray-50); +} + +[data-md-color-scheme="slate"] .md-nav__link--active { + 
color: var(--cgr-brand-light); + font-weight: 500; +} + +[data-md-color-scheme="slate"] .md-search__form { + background-color: var(--cgr-surface); + border: 1px solid var(--cgr-gray-800); +} + +[data-md-color-scheme="slate"] .md-search__input::placeholder { + color: var(--cgr-gray-500); +} + +[data-md-color-scheme="slate"] .md-typeset code { + background-color: var(--cgr-surface); + border: 1px solid var(--cgr-gray-800); + color: var(--cgr-brand-light); +} + +[data-md-color-scheme="slate"] .md-typeset .admonition, +[data-md-color-scheme="slate"] .md-typeset details { + background-color: var(--cgr-surface); + border-color: var(--cgr-gray-800); +} + +[data-md-color-scheme="slate"] .md-typeset .md-typeset__table table { + border: 1px solid var(--cgr-gray-800); +} + +[data-md-color-scheme="slate"] .md-typeset .md-typeset__table th { + background-color: var(--cgr-surface); + border-color: var(--cgr-gray-800); +} + +[data-md-color-scheme="slate"] .md-typeset .md-typeset__table td { + border-color: var(--cgr-gray-800); +} + +[data-md-color-scheme="slate"] .md-typeset hr { + border-color: var(--cgr-gray-800); +} + +/* Light mode */ +[data-md-color-scheme="default"] { + --md-primary-fg-color: var(--cgr-brand-dark); + --md-primary-fg-color--light: var(--cgr-brand); + --md-primary-fg-color--dark: var(--cgr-indigo-700); + --md-primary-bg-color: #ffffff; + --md-accent-fg-color: var(--cgr-brand); + --md-accent-fg-color--transparent: rgba(99, 102, 241, 0.1); + --md-typeset-a-color: var(--cgr-brand-dark); + --md-code-bg-color: #f8f9fc; + --md-code-fg-color: #1e293b; + --md-code-hl-color: rgba(99, 102, 241, 0.08); + --md-code-hl-number-color: #b45309; + --md-code-hl-string-color: #059669; + --md-code-hl-keyword-color: #7c3aed; + --md-code-hl-function-color: #2563eb; + --md-code-hl-comment-color: #9ca3af; + --md-shadow-z1: 0 0 0 transparent; + --md-shadow-z2: 0 1px 3px rgba(0, 0, 0, 0.08); +} + +[data-md-color-scheme="default"] .md-header { + background-color: #ffffff; + border-bottom: 1px solid #e5e7eb; + color: #1e293b; +} + +[data-md-color-scheme="default"] .md-header .md-header__title { + color: #1e293b; +} + +[data-md-color-scheme="default"] .md-header .md-header__topic { + color: #1e293b; +} + +[data-md-color-scheme="default"] .md-header .md-header__button { + color: #475569; +} + +[data-md-color-scheme="default"] .md-tabs { + background-color: #ffffff; + border-bottom: 1px solid #e5e7eb; +} + +[data-md-color-scheme="default"] .md-tabs__link { + color: #64748b; + opacity: 1; +} + +[data-md-color-scheme="default"] .md-tabs__link:hover { + color: #1e293b; +} + +[data-md-color-scheme="default"] .md-tabs__link--active { + color: var(--cgr-brand-dark); +} + +[data-md-color-scheme="default"] .md-typeset code { + background-color: #f1f5f9; + border: 1px solid #e2e8f0; + color: var(--cgr-brand-dark); +} + +[data-md-color-scheme="default"] .md-search__form { + background-color: #f1f5f9; + border: 1px solid #e2e8f0; +} + +/* Shared styles */ +.md-typeset { + font-family: "Inter", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; + font-size: 0.82rem; + line-height: 1.7; +} + +.md-typeset code, +.md-typeset pre, +.md-typeset kbd { + font-family: "JetBrains Mono", "SF Mono", "Cascadia Code", "Fira Code", monospace; + font-size: 0.82em; +} + +.md-typeset h1 { + font-weight: 700; + letter-spacing: -0.02em; +} + +.md-typeset h2 { + font-weight: 600; + letter-spacing: -0.01em; +} + +.md-typeset h3, +.md-typeset h4 { + font-weight: 600; +} + +.md-typeset a { + transition: color 0.2s ease; +} + 
+[data-md-color-scheme="slate"] .md-typeset a:hover { + color: var(--cgr-brand-light); +} + +[data-md-color-scheme="default"] .md-typeset a:hover { + color: var(--cgr-indigo-700); +} + +.md-header__title, +.md-tabs__link, +.md-nav__link, +.md-button, +.md-typeset .admonition-title, +.md-typeset summary, +.md-footer, +.md-typeset table:not([class]) th { + font-family: "Inter", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; +} + +.md-header__title { + font-weight: 600; +} + +.md-tabs__link { + font-weight: 500; + font-size: 0.78rem; + letter-spacing: 0.01em; +} + +.md-nav__link { + font-size: 0.76rem; +} + +.md-button { + font-weight: 500; + border-radius: 8px; + padding: 0.6em 1.4em; + transition: background-color 0.2s ease, border-color 0.2s ease, transform 0.2s ease; +} + +.md-button--primary { + background-color: var(--cgr-brand); + border-color: var(--cgr-brand); + color: #ffffff; +} + +.md-button--primary:hover { + background-color: var(--cgr-brand-dark); + border-color: var(--cgr-brand-dark); + color: #ffffff; +} + +.md-typeset .md-button:hover { + transform: translateY(-1px); +} + +.md-content { + max-width: 52rem; +} + +.md-typeset pre > code { + border-radius: 8px; +} + +.md-typeset .admonition, +.md-typeset details { + border-radius: 8px; + border-width: 1px; + border-left-width: 4px; +} + +.md-typeset .admonition-title, +.md-typeset summary { + font-weight: 600; +} + +.md-search__form { + border-radius: 8px; +} + +.md-typeset table:not([class]) { + font-size: 0.8rem; + border-radius: 8px; + overflow: hidden; +} + +.md-typeset table:not([class]) th { + font-weight: 600; +} + +@media screen and (min-width: 76.25em) { + .md-sidebar--primary { + width: 13rem; + } +} diff --git a/funding.json b/funding.json new file mode 100644 index 000000000..baa0c096c --- /dev/null +++ b/funding.json @@ -0,0 +1,108 @@ +{ + "$schema": "https://fundingjson.org/schema/v1.1.0.json", + "version": "v1.1.0", + "entity": { + "type": "individual", + "role": "owner", + "name": "Vitali Avagyan", + "email": "eheva87@gmail.com", + "description": "Creator and maintainer of Code-Graph-RAG, an open source tool for AI-powered codebase understanding via knowledge graphs.", + "webpageUrl": { + "url": "https://code-graph-rag.com" + } + }, + "projects": [ + { + "guid": "code-graph-rag", + "name": "Code-Graph-RAG", + "description": "An open source retrieval-augmented generation system that analyzes multi-language codebases using Tree-sitter, builds comprehensive knowledge graphs, and enables natural language querying and editing of codebase structure and relationships. 
Supports 11 programming languages with a unified graph schema and functions as an MCP server for AI assistant integration.", + "webpageUrl": { + "url": "https://code-graph-rag.com" + }, + "repositoryUrl": { + "url": "https://github.com/vitali87/code-graph-rag" + }, + "licenses": [ + "spdx:MIT" + ], + "tags": [ + "rag", + "knowledge-graph", + "code-analysis", + "tree-sitter", + "mcp-server", + "developer-tools", + "ai", + "graph-database", + "semantic-search", + "python" + ] + } + ], + "funding": { + "channels": [ + { + "guid": "github-sponsors", + "type": "payment-provider", + "address": "https://github.com/sponsors/vitali87", + "description": "GitHub Sponsors" + }, + { + "guid": "buy-me-a-coffee", + "type": "payment-provider", + "address": "https://buymeacoffee.com/vitali87", + "description": "Buy Me a Coffee" + } + ], + "plans": [ + { + "guid": "one-time-any", + "status": "active", + "name": "One-time donation", + "description": "Support Code-Graph-RAG development with a one-time contribution of any amount.", + "amount": 0, + "currency": "USD", + "frequency": "one-time", + "channels": [ + "github-sponsors", + "buy-me-a-coffee" + ] + }, + { + "guid": "monthly-supporter", + "status": "active", + "name": "Monthly supporter", + "description": "Recurring monthly support for ongoing development, security maintenance, and new language support.", + "amount": 0, + "currency": "USD", + "frequency": "monthly", + "channels": [ + "github-sponsors", + "buy-me-a-coffee" + ] + }, + { + "guid": "annual-sponsor", + "status": "active", + "name": "Annual sponsor", + "description": "Yearly sponsorship for sustained development of Code-Graph-RAG as open infrastructure for AI-powered codebase understanding.", + "amount": 25000, + "currency": "USD", + "frequency": "yearly", + "channels": [ + "github-sponsors" + ] + } + ], + "history": [ + { + "year": 2025, + "income": 0, + "expenses": 0, + "taxes": 0, + "currency": "USD", + "description": "Project launched in 2025. No external funding received." + } + ] + } +} diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 000000000..bb27de2c9 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,116 @@ +site_name: Code-Graph-RAG +site_url: https://docs.code-graph-rag.com +site_description: >- + Graph-based RAG system that parses multi-language codebases with Tree-sitter, + builds knowledge graphs, and enables natural language querying, editing, + and optimization. 
+site_author: Vitali Avagyan + +repo_name: vitali87/code-graph-rag +repo_url: https://github.com/vitali87/code-graph-rag +edit_uri: edit/main/docs/ + +copyright: Copyright © 2024 Vitali Avagyan + +theme: + name: material + custom_dir: docs/overrides + logo: assets/logo-icon.png + favicon: assets/favicon.png + font: + text: Inter + code: JetBrains Mono + palette: + - scheme: slate + primary: custom + accent: custom + toggle: + icon: material/brightness-4 + name: Switch to light mode + - scheme: default + primary: custom + accent: custom + toggle: + icon: material/brightness-7 + name: Switch to dark mode + features: + - navigation.instant + - navigation.tracking + - navigation.tabs + - navigation.sections + - navigation.expand + - navigation.top + - search.suggest + - search.highlight + - content.code.copy + - content.code.annotate + - content.tabs.link + - toc.follow + icon: + repo: fontawesome/brands/github + +plugins: + - search + - minify: + minify_html: true + +markdown_extensions: + - admonition + - pymdownx.details + - pymdownx.superfences + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.tabbed: + alternate_style: true + - pymdownx.snippets + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + - tables + - attr_list + - md_in_html + - toc: + permalink: true + +nav: + - Home: index.md + - Getting Started: + - Installation: getting-started/installation.md + - Configuration: getting-started/configuration.md + - Quick Start: getting-started/quickstart.md + - User Guide: + - CLI Reference: guide/cli-reference.md + - Interactive Querying: guide/interactive-querying.md + - Code Optimization: guide/code-optimization.md + - Graph Export: guide/graph-export.md + - Real-Time Updates: guide/realtime-updates.md + - MCP Server: guide/mcp-server.md + - Python SDK: + - Overview: sdk/overview.md + - Graph Loader: sdk/graph-loader.md + - Cypher Generator: sdk/cypher-generator.md + - Semantic Search: sdk/semantic-search.md + - Architecture: + - Overview: architecture/overview.md + - Graph Schema: architecture/graph-schema.md + - Language Support: architecture/language-support.md + - Advanced: + - Adding Languages: advanced/adding-languages.md + - Ignore Patterns: advanced/ignore-patterns.md + - Building Binaries: advanced/building-binaries.md + - Troubleshooting: advanced/troubleshooting.md + - Contributing: contributing.md + +extra_css: + - stylesheets/extra.css + +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/vitali87/code-graph-rag + - icon: fontawesome/brands/python + link: https://pypi.org/project/code-graph-rag/ + generator: false diff --git a/optimize/memory_profile.py b/optimize/memory_profile.py new file mode 100644 index 000000000..eaf98c2e3 --- /dev/null +++ b/optimize/memory_profile.py @@ -0,0 +1,665 @@ +"""Memory allocation profiler for code-graph-rag. + +Profiles the main data structures and parsing pipeline using tracemalloc. +Does NOT require external services (Memgraph, Qdrant). 
+""" + +import gc +import json +import sys +import tracemalloc +from collections import OrderedDict, defaultdict +from pathlib import Path +from textwrap import dedent + +PROJECT_ROOT = Path(__file__).resolve().parent.parent + +sys.path.insert(0, str(PROJECT_ROOT)) + + +def format_bytes(size: int) -> str: + for unit in ("B", "KiB", "MiB", "GiB"): + if abs(size) < 1024: + return f"{size:.1f} {unit}" + size /= 1024 # type: ignore[assignment] + return f"{size:.1f} TiB" + + +def snapshot_diff(label: str, snap1: tracemalloc.Snapshot, snap2: tracemalloc.Snapshot, top_n: int = 15) -> dict: + stats = snap2.compare_to(snap1, "lineno") + total_diff = sum(s.size_diff for s in stats if s.size_diff > 0) + result = { + "label": label, + "total_new_alloc": total_diff, + "total_new_alloc_human": format_bytes(total_diff), + "top_allocators": [], + } + for stat in stats[:top_n]: + if stat.size_diff > 0: + result["top_allocators"].append({ + "file": str(stat.traceback), + "size_diff": stat.size_diff, + "size_diff_human": format_bytes(stat.size_diff), + "count_diff": stat.count_diff, + }) + return result + + +def measure_object_sizes() -> dict: + """Measure sizes of core Python data structures used in the codebase.""" + results = {} + + # 1. FunctionRegistryTrie: dict + trie node overhead + from codebase_rag.graph_updater import FunctionRegistryTrie + + trie = FunctionRegistryTrie() + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + for i in range(10_000): + qn = f"project.module_{i // 100}.class_{i // 10}.func_{i}" + trie.insert(qn, "Function") + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["FunctionRegistryTrie_10k_insert"] = snapshot_diff( + "FunctionRegistryTrie: insert 10k qualified names", snap_before, snap_after + ) + results["FunctionRegistryTrie_10k_insert"]["entries_size"] = sys.getsizeof(trie._entries) + results["FunctionRegistryTrie_10k_insert"]["entry_count"] = len(trie._entries) + + # Measure trie overhead vs flat dict + flat_dict = {} + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + for i in range(10_000): + qn = f"project.module_{i // 100}.class_{i // 10}.func_{i}" + flat_dict[qn] = "Function" + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["flat_dict_10k_baseline"] = snapshot_diff( + "Flat dict: 10k entries baseline", snap_before, snap_after + ) + + # 2. SimpleNameLookup: defaultdict[str, set[str]] + simple_lookup: defaultdict[str, set[str]] = defaultdict(set) + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + for i in range(10_000): + simple_name = f"func_{i % 500}" + qn = f"project.module_{i // 100}.class_{i // 10}.{simple_name}" + simple_lookup[simple_name].add(qn) + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["SimpleNameLookup_10k"] = snapshot_diff( + "SimpleNameLookup: 10k entries, 500 unique names", snap_before, snap_after + ) + + # 3. 
BoundedASTCache with OrderedDict + from codebase_rag.graph_updater import BoundedASTCache + + cache = BoundedASTCache(max_entries=5000, max_memory_mb=512) + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + # Simulate storing mock entries (can't use real AST nodes without tree-sitter parsing) + for i in range(1000): + key = Path(f"/fake/path/module_{i}.py") + # Use a placeholder tuple since we can't create real AST nodes without parsing + cache.cache[key] = (None, "python") # type: ignore + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["BoundedASTCache_1k_entries"] = snapshot_diff( + "BoundedASTCache (OrderedDict): 1k entries", snap_before, snap_after + ) + + # 4. node_buffer in MemgraphIngestor pattern + node_buffer: list[tuple[str, dict[str, str | int | float | bool | list[str] | None]]] = [] + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + for i in range(5000): + node_buffer.append(( + "Function", + { + "qualified_name": f"project.mod_{i // 50}.cls_{i // 10}.fn_{i}", + "name": f"fn_{i}", + "start_line": i * 10, + "end_line": i * 10 + 15, + "path": f"src/mod_{i // 50}/cls_{i // 10}.py", + }, + )) + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["node_buffer_5k"] = snapshot_diff( + "node_buffer: 5k buffered nodes", snap_before, snap_after + ) + + # 5. _rel_groups in MemgraphIngestor pattern + rel_groups: defaultdict[tuple, list[dict]] = defaultdict(list) + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + for i in range(10_000): + pattern = ("Function", "qualified_name", "CALLS", "Function", "qualified_name") + rel_groups[pattern].append({ + "from_val": f"project.mod.fn_{i}", + "to_val": f"project.mod.fn_{i + 1}", + "props": {}, + }) + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["rel_groups_10k"] = snapshot_diff( + "rel_groups: 10k buffered relationships", snap_before, snap_after + ) + + # 6. import_mapping pattern + import_mapping: dict[str, dict[str, str]] = {} + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + for i in range(2000): + module_qn = f"project.module_{i}" + imports = {} + for j in range(20): + imports[f"import_{j}"] = f"external.package_{j}.symbol_{j}" + import_mapping[module_qn] = imports + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["import_mapping_2k_modules"] = snapshot_diff( + "import_mapping: 2k modules x 20 imports each", snap_before, snap_after + ) + + # 7. 
class_inheritance pattern + class_inheritance: dict[str, list[str]] = {} + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + for i in range(3000): + class_qn = f"project.module_{i // 30}.Class_{i}" + parents = [f"project.module_{i // 30}.BaseClass_{j}" for j in range(3)] + class_inheritance[class_qn] = parents + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["class_inheritance_3k"] = snapshot_diff( + "class_inheritance: 3k classes x 3 parents", snap_before, snap_after + ) + + return results + + +def measure_tree_sitter_parsing() -> dict: + """Profile memory during tree-sitter parsing of actual Python files.""" + results = {} + + try: + from tree_sitter import Language, Parser + import tree_sitter_python + + py_language = Language(tree_sitter_python.language()) + parser = Parser(py_language) + except Exception as e: + return {"error": f"tree-sitter setup failed: {e}"} + + # Find Python files in the project itself + py_files = sorted(PROJECT_ROOT.glob("codebase_rag/**/*.py")) + if not py_files: + return {"error": "No Python files found"} + + # Profile parsing all project files + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + trees = [] + total_bytes_parsed = 0 + for f in py_files: + try: + source = f.read_bytes() + total_bytes_parsed += len(source) + tree = parser.parse(source) + trees.append((f, tree)) + except Exception: + pass + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["parse_all_project_files"] = snapshot_diff( + f"Parse {len(trees)} Python files ({format_bytes(total_bytes_parsed)} source)", + snap_before, snap_after + ) + results["parse_all_project_files"]["file_count"] = len(trees) + results["parse_all_project_files"]["source_bytes"] = total_bytes_parsed + + # Profile AST node retention + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + root_nodes = [tree.root_node for _, tree in trees] + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["ast_node_retention"] = snapshot_diff( + f"Retaining {len(root_nodes)} AST root nodes", snap_before, snap_after + ) + + # Profile what happens when we walk AST nodes (simulating function extraction) + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + all_function_nodes = [] + for root in root_nodes: + stack = [root] + while stack: + node = stack.pop() + if node.type in ("function_definition", "class_definition"): + all_function_nodes.append(node) + stack.extend(node.children) + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["ast_walk_function_extraction"] = snapshot_diff( + f"Walking ASTs, collected {len(all_function_nodes)} function/class nodes", + snap_before, snap_after, + ) + results["ast_walk_function_extraction"]["function_class_count"] = len(all_function_nodes) + + # Cleanup + del trees, root_nodes, all_function_nodes + + return results + + +def measure_graph_loader_json() -> dict: + """Profile GraphLoader JSON loading and indexing with synthetic data.""" + results = {} + + # Create synthetic graph JSON + nodes = [] + relationships = [] + for i in range(5000): + nodes.append({ + "node_id": i, + "labels": ["Function"], + "properties": { + "qualified_name": f"project.module_{i // 50}.class_{i // 10}.func_{i}", + "name": f"func_{i}", + "start_line": i * 10, + "end_line": i * 10 + 15, + "path": f"src/module_{i // 50}/class_{i // 10}.py", + }, + }) + for i in range(8000): + relationships.append({ + 
"from_id": i % 5000, + "to_id": (i + 1) % 5000, + "type": "CALLS", + "properties": {}, + }) + + graph_data = { + "nodes": nodes, + "relationships": relationships, + "metadata": { + "total_nodes": len(nodes), + "total_relationships": len(relationships), + "exported_at": "2024-01-01T00:00:00Z", + }, + } + + # Write temp file + tmp_path = PROJECT_ROOT / "optimize" / "_tmp_graph.json" + with open(tmp_path, "w") as f: + json.dump(graph_data, f) + + try: + from codebase_rag.graph_loader import GraphLoader + + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + loader = GraphLoader(str(tmp_path)) + loader.load() + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["graph_loader_5k_nodes_8k_rels"] = snapshot_diff( + "GraphLoader: load 5k nodes + 8k relationships from JSON", + snap_before, snap_after, + ) + + # Measure index building + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + loader._build_property_index("qualified_name") + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["graph_loader_property_index"] = snapshot_diff( + "GraphLoader: build property index on qualified_name", + snap_before, snap_after, + ) + + except Exception as e: + results["error"] = str(e) + finally: + tmp_path.unlink(missing_ok=True) + + return results + + +def measure_embedding_cache() -> dict: + """Profile EmbeddingCache with simulated embeddings.""" + results = {} + + try: + from codebase_rag.embedder import EmbeddingCache + + cache = EmbeddingCache() + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + # Simulate 2k embeddings, each 768-dim float vector + for i in range(2000): + content = f"def function_{i}(x, y): return x + y + {i}" + embedding = [float(j) / 768.0 for j in range(768)] + cache.put(content, embedding) + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["embedding_cache_2k_768dim"] = snapshot_diff( + "EmbeddingCache: 2k entries x 768-dim embeddings", + snap_before, snap_after, + ) + results["embedding_cache_2k_768dim"]["cache_dict_size"] = sys.getsizeof(cache._cache) + results["embedding_cache_2k_768dim"]["entry_count"] = len(cache) + + except Exception as e: + results["error"] = str(e) + + return results + + +def measure_gc_pressure() -> dict: + """Measure GC pressure by tracking collections during workload simulation.""" + results = {} + + gc.collect() + gc_stats_before = gc.get_stats() + gc.disable() + + # Simulate a typical file processing workload creating many temporary objects + temp_objects_created = 0 + for i in range(1000): + # Simulate tree-sitter query results (lists of tuples, dicts) + captures = {"function": [f"node_{j}" for j in range(20)]} + for func_name in captures["function"]: + # Simulate qualified name construction (many string concatenations) + parts = ["project", f"module_{i}", f"class_{i // 10}", func_name] + qn = ".".join(parts) + # Simulate property dict construction + props = { + "qualified_name": qn, + "name": func_name, + "start_line": i * 10, + "end_line": i * 10 + 15, + } + temp_objects_created += 1 + del props + + gc.enable() + gc.collect() + gc_stats_after = gc.get_stats() + + results["gc_pressure_simulation"] = { + "label": "GC pressure during simulated file processing (1k files x 20 funcs)", + "temp_objects_created": temp_objects_created, + "gc_gen0_before": gc_stats_before[0], + "gc_gen0_after": gc_stats_after[0], + "gc_gen1_before": gc_stats_before[1], + "gc_gen1_after": gc_stats_after[1], + 
"gc_gen2_before": gc_stats_before[2], + "gc_gen2_after": gc_stats_after[2], + } + + return results + + +def measure_string_duplication() -> dict: + """Estimate memory wasted on duplicated strings in typical data structures.""" + results = {} + + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + # Simulate how property dicts repeat the same key strings thousands of times + all_dicts: list[dict] = [] + for i in range(5000): + d = { + "qualified_name": f"project.mod_{i // 50}.cls_{i // 10}.fn_{i}", + "name": f"fn_{i}", + "start_line": i * 10, + "end_line": i * 10 + 15, + "path": f"src/mod_{i // 50}/cls_{i // 10}.py", + } + all_dicts.append(d) + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["property_dict_duplication_5k"] = snapshot_diff( + "5k property dicts with repeated key strings", snap_before, snap_after + ) + + # Compare: same data using tuples (no key duplication) + gc.collect() + tracemalloc.clear_traces() + snap_before = tracemalloc.take_snapshot() + + all_tuples: list[tuple] = [] + for i in range(5000): + t = ( + f"project.mod_{i // 50}.cls_{i // 10}.fn_{i}", + f"fn_{i}", + i * 10, + i * 10 + 15, + f"src/mod_{i // 50}/cls_{i // 10}.py", + ) + all_tuples.append(t) + + gc.collect() + snap_after = tracemalloc.take_snapshot() + results["property_tuple_alternative_5k"] = snapshot_diff( + "5k tuples (no key duplication) as alternative", snap_before, snap_after + ) + + return results + + +def measure_peak_usage_full_pipeline() -> dict: + """Simulate the full pipeline memory envelope. + + This exercises the complete data structure lifecycle: + 1. Build FunctionRegistryTrie + 2. Build import mappings + 3. Build class inheritance + 4. Buffer nodes and relationships + 5. Measure peak + """ + results = {} + + gc.collect() + tracemalloc.clear_traces() + snap_baseline = tracemalloc.take_snapshot() + + # Phase 1: Build FunctionRegistryTrie + from codebase_rag.graph_updater import FunctionRegistryTrie + + simple_name_lookup: defaultdict[str, set[str]] = defaultdict(set) + trie = FunctionRegistryTrie(simple_name_lookup=simple_name_lookup) + + for i in range(15_000): + simple_name = f"func_{i % 1000}" + qn = f"project.module_{i // 150}.class_{i // 15}.{simple_name}" + trie.insert(qn, "Function") + simple_name_lookup[simple_name].add(qn) + + gc.collect() + snap_phase1 = tracemalloc.take_snapshot() + results["phase1_trie_15k"] = snapshot_diff( + "Phase 1: FunctionRegistryTrie + SimpleNameLookup (15k entries)", + snap_baseline, snap_phase1, + ) + + # Phase 2: Import mappings + import_mapping: dict[str, dict[str, str]] = {} + for i in range(1500): + module_qn = f"project.module_{i}" + imports = {f"sym_{j}": f"ext.pkg_{j}.sym_{j}" for j in range(25)} + import_mapping[module_qn] = imports + + gc.collect() + snap_phase2 = tracemalloc.take_snapshot() + results["phase2_imports_1500_modules"] = snapshot_diff( + "Phase 2: import_mapping (1500 modules x 25 imports)", + snap_phase1, snap_phase2, + ) + + # Phase 3: Class inheritance + class_inheritance: dict[str, list[str]] = {} + for i in range(5000): + class_qn = f"project.module_{i // 50}.Class_{i}" + parents = [f"project.module_{i // 50}.Base_{j}" for j in range(2)] + class_inheritance[class_qn] = parents + + gc.collect() + snap_phase3 = tracemalloc.take_snapshot() + results["phase3_inheritance_5k"] = snapshot_diff( + "Phase 3: class_inheritance (5k classes x 2 parents)", + snap_phase2, snap_phase3, + ) + + # Phase 4: Node + relationship buffers + node_buffer: list[tuple[str, dict]] = [] + for i in 
range(10_000): + node_buffer.append(( + "Function", + { + "qualified_name": f"project.mod_{i // 100}.cls_{i // 10}.fn_{i}", + "name": f"fn_{i}", + "start_line": i * 5, + "end_line": i * 5 + 10, + }, + )) + + rel_groups: defaultdict[tuple, list[dict]] = defaultdict(list) + for i in range(20_000): + pattern = ("Function", "qualified_name", "CALLS", "Function", "qualified_name") + rel_groups[pattern].append({ + "from_val": f"project.mod.fn_{i}", + "to_val": f"project.mod.fn_{i + 1}", + "props": {}, + }) + + gc.collect() + snap_phase4 = tracemalloc.take_snapshot() + results["phase4_buffers_10k_nodes_20k_rels"] = snapshot_diff( + "Phase 4: node_buffer (10k) + rel_groups (20k)", + snap_phase3, snap_phase4, + ) + + # Total from baseline + results["total_pipeline_memory"] = snapshot_diff( + "TOTAL: Full pipeline memory (all phases combined)", + snap_baseline, snap_phase4, + ) + + # Peak usage + current, peak = tracemalloc.get_traced_memory() + results["peak_traced_memory"] = { + "current": current, + "current_human": format_bytes(current), + "peak": peak, + "peak_human": format_bytes(peak), + } + + return results + + +def main() -> None: + tracemalloc.start(25) # 25 frames for stack traces + + all_results: dict[str, dict] = {} + + print("=" * 70) + print("MEMORY ALLOCATION PROFILING REPORT") + print("=" * 70) + + print("\n[1/7] Measuring core data structure sizes...") + all_results["data_structures"] = measure_object_sizes() + + print("[2/7] Profiling tree-sitter parsing...") + all_results["tree_sitter"] = measure_tree_sitter_parsing() + + print("[3/7] Profiling GraphLoader JSON loading...") + all_results["graph_loader"] = measure_graph_loader_json() + + print("[4/7] Profiling EmbeddingCache...") + all_results["embedding_cache"] = measure_embedding_cache() + + print("[5/7] Measuring GC pressure...") + all_results["gc_pressure"] = measure_gc_pressure() + + print("[6/7] Measuring string duplication overhead...") + all_results["string_duplication"] = measure_string_duplication() + + print("[7/7] Measuring peak usage in full pipeline simulation...") + all_results["full_pipeline"] = measure_peak_usage_full_pipeline() + + tracemalloc.stop() + + # Print summary report + print("\n" + "=" * 70) + print("RESULTS SUMMARY") + print("=" * 70) + + for section_name, section_data in all_results.items(): + print(f"\n--- {section_name.upper()} ---") + for key, value in section_data.items(): + if isinstance(value, dict) and "label" in value: + total = value.get("total_new_alloc_human", value.get("peak_human", "N/A")) + print(f" {value['label']}") + print(f" Total new allocation: {total}") + if "top_allocators" in value: + for i, alloc in enumerate(value["top_allocators"][:5]): + print(f" [{i+1}] {alloc['size_diff_human']} ({alloc['count_diff']} objects) - {alloc['file'][:80]}") + elif isinstance(value, dict) and "current_human" in value: + print(f" Current traced: {value['current_human']}") + print(f" Peak traced: {value['peak_human']}") + elif isinstance(value, dict) and "temp_objects_created" in value: + print(f" {value['label']}") + print(f" Temp objects created: {value['temp_objects_created']}") + for gen in range(3): + before = value[f"gc_gen{gen}_before"] + after = value[f"gc_gen{gen}_after"] + print(f" Gen{gen}: collections {before['collections']} -> {after['collections']}, collected {before['collected']} -> {after['collected']}") + + # Save detailed JSON + output_path = PROJECT_ROOT / "optimize" / "memory_profile_results.json" + with open(output_path, "w") as f: + json.dump(all_results, f, indent=2, 
default=str) + print(f"\nDetailed results saved to: {output_path}") + + +if __name__ == "__main__": + main() diff --git a/optimize/memory_profile_results.json b/optimize/memory_profile_results.json new file mode 100644 index 000000000..f8cb642db --- /dev/null +++ b/optimize/memory_profile_results.json @@ -0,0 +1,1482 @@ +{ + "data_structures": { + "FunctionRegistryTrie_10k_insert": { + "label": "FunctionRegistryTrie: insert 10k qualified names", + "total_new_alloc": 3681520, + "total_new_alloc_human": "3.5 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:56", + "size_diff": 1079880, + "size_diff_human": "1.0 MiB", + "count_diff": 8999 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:51", + "size_diff": 1062648, + "size_diff_human": "1.0 MiB", + "count_diff": 13203 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:61", + "size_diff": 776790, + "size_diff_human": "758.6 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:46", + "size_diff": 553818, + "size_diff_human": "540.8 KiB", + "count_diff": 11101 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:44", + "size_diff": 207672, + "size_diff_human": "202.8 KiB", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 312, + "size_diff_human": "312.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 312, + "size_diff_human": "312.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:60", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ], + "entries_size": 207616, + "entry_count": 10000 + }, + "flat_dict_10k_baseline": { + "label": "Flat dict: 10k entries baseline", + "total_new_alloc": 985022, + "total_new_alloc_human": "961.9 KiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:78", + "size_diff": 776790, + "size_diff_human": "758.6 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:79", + "size_diff": 207552, + "size_diff_human": "202.7 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 296, + "size_diff_human": "296.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 296, + "size_diff_human": "296.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:77", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + 
} + ] + }, + "SimpleNameLookup_10k": { + "label": "SimpleNameLookup: 10k entries, 500 unique names", + "total_new_alloc": 1935779, + "total_new_alloc_human": "1.8 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:94", + "size_diff": 1144992, + "size_diff_human": "1.1 MiB", + "count_diff": 1001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:93", + "size_diff": 765700, + "size_diff_human": "747.8 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:92", + "size_diff": 24439, + "size_diff_human": "23.9 KiB", + "count_diff": 501 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 280, + "size_diff_human": "280.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 280, + "size_diff_human": "280.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:91", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "BoundedASTCache_1k_entries": { + "label": "BoundedASTCache (OrderedDict): 1k entries", + "total_new_alloc": 585087, + "total_new_alloc_human": "571.4 KiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:404", + "size_diff": 141935, + "size_diff_human": "138.6 KiB", + "count_diff": 3001 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:1167", + "size_diff": 104000, + "size_diff_human": "101.6 KiB", + "count_diff": 1000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:113", + "size_diff": 85272, + "size_diff_human": "83.3 KiB", + "count_diff": 1002 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:111", + "size_diff": 64890, + "size_diff_human": "63.4 KiB", + "count_diff": 1000 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:432", + "size_diff": 64890, + "size_diff_human": "63.4 KiB", + "count_diff": 1000 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:359", + "size_diff": 55944, + "size_diff_human": "54.6 KiB", + "count_diff": 999 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:528", + "size_diff": 35540, + "size_diff_human": "34.7 KiB", + "count_diff": 1000 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:377", + "size_diff": 32000, + "size_diff_human": "31.2 KiB", + "count_diff": 1000 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 264, + "size_diff_human": "264.0 B", + "count_diff": 2 + }, + { + "file": 
"/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 264, + "size_diff_human": "264.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:110", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "node_buffer_5k": { + "label": "node_buffer: 5k buffered nodes", + "total_new_alloc": 2460116, + "total_new_alloc_human": "2.3 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:128", + "size_diff": 920000, + "size_diff_human": "898.4 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:129", + "size_diff": 352290, + "size_diff_human": "344.0 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:126", + "size_diff": 321600, + "size_diff_human": "314.1 KiB", + "count_diff": 4997 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:133", + "size_diff": 308400, + "size_diff_human": "301.2 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:130", + "size_diff": 238890, + "size_diff_human": "233.3 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:132", + "size_diff": 159200, + "size_diff_human": "155.5 KiB", + "count_diff": 4975 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:131", + "size_diff": 159168, + "size_diff_human": "155.4 KiB", + "count_diff": 4974 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 240, + "size_diff_human": "240.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 240, + "size_diff_human": "240.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:125", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "rel_groups_10k": { + "label": "rel_groups: 10k buffered relationships", + "total_new_alloc": 3763656, + "total_new_alloc_human": "3.6 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:149", + "size_diff": 1925336, + "size_diff_human": "1.8 MiB", + "count_diff": 20003 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:152", + "size_diff": 640000, + "size_diff_human": "625.0 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:151", + "size_diff": 598894, + "size_diff_human": "584.9 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:150", + 
"size_diff": 598890, + "size_diff_human": "584.9 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 224, + "size_diff_human": "224.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 224, + "size_diff_human": "224.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:147", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "import_mapping_2k_modules": { + "label": "import_mapping: 2k modules x 20 imports each", + "total_new_alloc": 5839298, + "total_new_alloc_human": "5.6 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:169", + "size_diff": 5540000, + "size_diff_human": "5.3 MiB", + "count_diff": 82000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:167", + "size_diff": 128000, + "size_diff_human": "125.0 KiB", + "count_diff": 2000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:166", + "size_diff": 118890, + "size_diff_human": "116.1 KiB", + "count_diff": 2000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:170", + "size_diff": 51904, + "size_diff_human": "50.7 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 208, + "size_diff_human": "208.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 208, + "size_diff_human": "208.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:165", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "class_inheritance_3k": { + "label": "class_inheritance: 3k classes x 3 parents", + "total_new_alloc": 1202898, + "total_new_alloc_human": "1.1 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:184", + "size_diff": 893044, + "size_diff_human": "872.1 KiB", + "count_diff": 14999 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:183", + "size_diff": 205590, + "size_diff_human": "200.8 KiB", + "count_diff": 3000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:185", + "size_diff": 103792, + "size_diff_human": "101.4 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 192, + "size_diff_human": "192.0 B", + "count_diff": 2 + }, + { + "file": 
"/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 192, + "size_diff_human": "192.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:182", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + } + }, + "tree_sitter": { + "parse_all_project_files": { + "label": "Parse 343 Python files (5.4 MiB source)", + "total_new_alloc": 88243514, + "total_new_alloc_human": "84.2 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:224", + "size_diff": 82541776, + "size_diff_human": "78.7 MiB", + "count_diff": 903039 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:1020", + "size_diff": 5679234, + "size_diff_human": "5.4 MiB", + "count_diff": 337 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:225", + "size_diff": 22024, + "size_diff_human": "21.5 KiB", + "count_diff": 344 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 168, + "size_diff_human": "168.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 168, + "size_diff_human": "168.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:218", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:223", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ], + "file_count": 343, + "source_bytes": 5668113 + }, + "ast_node_retention": { + "label": "Retaining 343 AST root nodes", + "total_new_alloc": 25128, + "total_new_alloc_human": "24.5 KiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:243", + "size_diff": 24768, + "size_diff_human": "24.2 KiB", + "count_diff": 344 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 152, + "size_diff_human": "152.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 152, + "size_diff_human": "152.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + } + ] + }, + "ast_walk_function_extraction": { + "label": "Walking ASTs, collected 5578 function/class nodes", + "total_new_alloc": 91566344, + "total_new_alloc_human": "87.3 MiB", + "top_allocators": [ + { + "file": 
"/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:263", + "size_diff": 91518856, + "size_diff_human": "87.3 MiB", + "count_diff": 1673834 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:262", + "size_diff": 47104, + "size_diff_human": "46.0 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 136, + "size_diff_human": "136.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 136, + "size_diff_human": "136.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:258", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + } + ], + "function_class_count": 5578 + } + }, + "graph_loader": { + "graph_loader_5k_nodes_8k_rels": { + "label": "GraphLoader: load 5k nodes + 8k relationships from JSON", + "total_new_alloc": 9476802, + "total_new_alloc_human": "9.0 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/json/decoder.py:353", + "size_diff": 6787632, + "size_diff_human": "6.5 MiB", + "count_diff": 111693 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:74", + "size_diff": 770760, + "size_diff_human": "752.7 KiB", + "count_diff": 16000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:83", + "size_diff": 587480, + "size_diff_human": "573.7 KiB", + "count_diff": 10001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:82", + "size_diff": 587480, + "size_diff_human": "573.7 KiB", + "count_diff": 10001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:61", + "size_diff": 443080, + "size_diff_human": "432.7 KiB", + "count_diff": 10001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:68", + "size_diff": 147480, + "size_diff_human": "144.0 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:80", + "size_diff": 67168, + "size_diff_human": "65.6 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:70", + "size_diff": 41880, + "size_diff_human": "40.9 KiB", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:66", + "size_diff": 41824, + "size_diff_human": "40.8 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/.venv/lib/python3.12/site-packages/loguru/_logger.py:2003", + "size_diff": 200, + "size_diff_human": "200.0 B", + "count_diff": 4 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 120, + "size_diff_human": "120.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 120, + 
"size_diff_human": "120.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/pathlib.py:404", + "size_diff": 120, + "size_diff_human": "120.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:52", + "size_diff": 120, + "size_diff_human": "120.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/.venv/lib/python3.12/site-packages/loguru/_handler.py:120", + "size_diff": 120, + "size_diff_human": "120.0 B", + "count_diff": 1 + } + ] + }, + "graph_loader_property_index": { + "label": "GraphLoader: build property index on qualified_name", + "total_new_alloc": 544224, + "total_new_alloc_human": "531.5 KiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:99", + "size_diff": 440120, + "size_diff_human": "429.8 KiB", + "count_diff": 10001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_loader.py:100", + "size_diff": 103856, + "size_diff_human": "101.4 KiB", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 96, + "size_diff_human": "96.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 96, + "size_diff_human": "96.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + } + ] + } + }, + "embedding_cache": { + "embedding_cache_2k_768dim": { + "label": "EmbeddingCache: 2k entries x 768-dim embeddings", + "total_new_alloc": 50998237, + "total_new_alloc_human": "48.6 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:375", + "size_diff": 50736000, + "size_diff_human": "48.4 MiB", + "count_diff": 1540000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/embedder.py:26", + "size_diff": 210000, + "size_diff_human": "205.1 KiB", + "count_diff": 2000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/embedder.py:32", + "size_diff": 51904, + "size_diff_human": "50.7 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:374", + "size_diff": 85, + "size_diff_human": "85.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:373", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ], + "cache_dict_size": 51968, + "entry_count": 2000 + } + }, + "gc_pressure": { + 
"gc_pressure_simulation": { + "label": "GC pressure during simulated file processing (1k files x 20 funcs)", + "temp_objects_created": 20000, + "gc_gen0_before": { + "collections": 1785, + "collected": 8016, + "uncollectable": 0 + }, + "gc_gen0_after": { + "collections": 1785, + "collected": 8016, + "uncollectable": 0 + }, + "gc_gen1_before": { + "collections": 155, + "collected": 1262, + "uncollectable": 0 + }, + "gc_gen1_after": { + "collections": 155, + "collected": 1262, + "uncollectable": 0 + }, + "gc_gen2_before": { + "collections": 40, + "collected": 279, + "uncollectable": 0 + }, + "gc_gen2_after": { + "collections": 41, + "collected": 279, + "uncollectable": 0 + } + } + }, + "string_duplication": { + "property_dict_duplication_5k": { + "label": "5k property dicts with repeated key strings", + "total_new_alloc": 2180068, + "total_new_alloc_human": "2.1 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:449", + "size_diff": 920000, + "size_diff_human": "898.4 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:450", + "size_diff": 352290, + "size_diff_human": "344.0 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:454", + "size_diff": 308400, + "size_diff_human": "301.2 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:451", + "size_diff": 238890, + "size_diff_human": "233.3 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:453", + "size_diff": 159200, + "size_diff_human": "155.5 KiB", + "count_diff": 4975 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:452", + "size_diff": 159168, + "size_diff_human": "155.4 KiB", + "count_diff": 4974 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:456", + "size_diff": 41824, + "size_diff_human": "40.8 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 72, + "size_diff_human": "72.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:447", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:448", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "property_tuple_alternative_5k": { + "label": "5k tuples (no key duplication) as alternative", + "total_new_alloc": 1660012, + "total_new_alloc_human": "1.6 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:471", + "size_diff": 400000, + "size_diff_human": "390.6 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:472", + 
"size_diff": 352290, + "size_diff_human": "344.0 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:476", + "size_diff": 308400, + "size_diff_human": "301.2 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:473", + "size_diff": 238890, + "size_diff_human": "233.3 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:475", + "size_diff": 159200, + "size_diff_human": "155.5 KiB", + "count_diff": 4975 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:474", + "size_diff": 159168, + "size_diff_human": "155.4 KiB", + "count_diff": 4974 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:478", + "size_diff": 41824, + "size_diff_human": "40.8 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 72, + "size_diff_human": "72.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:470", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + } + }, + "full_pipeline": { + "phase1_trie_15k": { + "label": "Phase 1: FunctionRegistryTrie + SimpleNameLookup (15k entries)", + "total_new_alloc": 6411617, + "total_new_alloc_human": "6.1 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:56", + "size_diff": 1679760, + "size_diff_human": "1.6 MiB", + "count_diff": 13998 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:51", + "size_diff": 1574648, + "size_diff_human": "1.5 MiB", + "count_diff": 18203 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:513", + "size_diff": 1150200, + "size_diff_human": "1.1 MiB", + "count_diff": 15000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:46", + "size_diff": 788278, + "size_diff_human": "769.8 KiB", + "count_diff": 16101 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:515", + "size_diff": 754088, + "size_diff_human": "736.4 KiB", + "count_diff": 2002 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:44", + "size_diff": 415088, + "size_diff_human": "405.4 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:512", + "size_diff": 48939, + "size_diff_human": "47.8 KiB", + "count_diff": 1001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:509", + "size_diff": 176, + "size_diff_human": "176.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 80, + "size_diff_human": "80.0 B", + 
"count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 72, + "size_diff_human": "72.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:508", + "size_diff": 72, + "size_diff_human": "72.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:40", + "size_diff": 64, + "size_diff_human": "64.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:39", + "size_diff": 64, + "size_diff_human": "64.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:511", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "phase2_imports_1500_modules": { + "label": "Phase 2: import_mapping (1500 modules x 25 imports)", + "total_new_alloc": 5287898, + "total_new_alloc_human": "5.0 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:528", + "size_diff": 5140500, + "size_diff_human": "4.9 MiB", + "count_diff": 78000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:527", + "size_diff": 88890, + "size_diff_human": "86.8 KiB", + "count_diff": 1500 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:529", + "size_diff": 51904, + "size_diff_human": "50.7 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:39", + "size_diff": 2888, + "size_diff_human": "2.8 KiB", + "count_diff": 31 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:173", + "size_diff": 1872, + "size_diff_human": "1.8 KiB", + "count_diff": 15 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:23", + "size_diff": 768, + "size_diff_human": "768.0 B", + "count_diff": 16 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:503", + "size_diff": 192, + "size_diff_human": "192.0 B", + "count_diff": 6 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:502", + "size_diff": 192, + "size_diff_human": "192.0 B", + "count_diff": 6 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:31", + "size_diff": 184, + "size_diff_human": "184.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:519", + "size_diff": 120, + "size_diff_human": "120.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 72, + "size_diff_human": "72.0 B", + "count_diff": 2 + }, + { + "file": 
"/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:525", + "size_diff": 64, + "size_diff_human": "64.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:35", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + } + ] + }, + "phase3_inheritance_5k": { + "label": "Phase 3: class_inheritance (5k classes x 2 parents)", + "total_new_alloc": 1542592, + "total_new_alloc_human": "1.5 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:542", + "size_diff": 1089000, + "size_diff_human": "1.0 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:541", + "size_diff": 343390, + "size_diff_human": "335.3 KiB", + "count_diff": 5000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:543", + "size_diff": 103792, + "size_diff_human": "101.4 KiB", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:39", + "size_diff": 2888, + "size_diff_human": "2.8 KiB", + "count_diff": 31 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:173", + "size_diff": 1961, + "size_diff_human": "1.9 KiB", + "count_diff": 15 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:23", + "size_diff": 765, + "size_diff_human": "765.0 B", + "count_diff": 16 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:31", + "size_diff": 184, + "size_diff_human": "184.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:502", + "size_diff": 160, + "size_diff_human": "160.0 B", + "count_diff": 5 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:560", + "size_diff": 80, + "size_diff_human": "80.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:423", + "size_diff": 72, + "size_diff_human": "72.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:503", + "size_diff": 64, + "size_diff_human": "64.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:539", + "size_diff": 64, + "size_diff_human": "64.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:558", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:35", + "size_diff": 56, + "size_diff_human": "56.0 B", + "count_diff": 1 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:540", + "size_diff": 32, + "size_diff_human": "32.0 B", + "count_diff": 1 + } + ] + }, + "phase4_buffers_10k_nodes_20k_rels": { + "label": "Phase 4: node_buffer (10k) 
+ rel_groups (20k)", + "total_new_alloc": 11864970, + "total_new_alloc_human": "11.3 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:568", + "size_diff": 3853176, + "size_diff_human": "3.7 MiB", + "count_diff": 40003 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:557", + "size_diff": 1840000, + "size_diff_human": "1.8 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:571", + "size_diff": 1280000, + "size_diff_human": "1.2 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:570", + "size_diff": 1208894, + "size_diff_human": "1.2 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:569", + "size_diff": 1208890, + "size_diff_human": "1.2 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:558", + "size_diff": 706790, + "size_diff_human": "690.2 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:555", + "size_diff": 645120, + "size_diff_human": "630.0 KiB", + "count_diff": 10001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:559", + "size_diff": 478890, + "size_diff_human": "467.7 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:561", + "size_diff": 318400, + "size_diff_human": "310.9 KiB", + "count_diff": 9950 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:560", + "size_diff": 318336, + "size_diff_human": "310.9 KiB", + "count_diff": 9948 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:39", + "size_diff": 2888, + "size_diff_human": "2.8 KiB", + "count_diff": 31 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:173", + "size_diff": 1961, + "size_diff_human": "1.9 KiB", + "count_diff": 15 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:23", + "size_diff": 765, + "size_diff_human": "765.0 B", + "count_diff": 16 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:31", + "size_diff": 184, + "size_diff_human": "184.0 B", + "count_diff": 2 + }, + { + "file": "/Users/vitaliavagyan/.local/share/uv/python/cpython-3.12.2-macos-aarch64-none/lib/python3.12/tracemalloc.py:126", + "size_diff": 96, + "size_diff_human": "96.0 B", + "count_diff": 3 + } + ] + }, + "total_pipeline_memory": { + "label": "TOTAL: Full pipeline memory (all phases combined)", + "total_new_alloc": 25106981, + "total_new_alloc_human": "23.9 MiB", + "top_allocators": [ + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:528", + "size_diff": 5140500, + "size_diff_human": "4.9 MiB", + "count_diff": 78000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:568", + "size_diff": 3853176, + "size_diff_human": "3.7 MiB", + "count_diff": 40003 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:557", + "size_diff": 1840000, + "size_diff_human": "1.8 MiB", + "count_diff": 20000 + }, + { + "file": 
"/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:56", + "size_diff": 1679760, + "size_diff_human": "1.6 MiB", + "count_diff": 13998 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:51", + "size_diff": 1574648, + "size_diff_human": "1.5 MiB", + "count_diff": 18203 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:571", + "size_diff": 1280000, + "size_diff_human": "1.2 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:570", + "size_diff": 1208894, + "size_diff_human": "1.2 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:569", + "size_diff": 1208890, + "size_diff_human": "1.2 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:513", + "size_diff": 1150200, + "size_diff_human": "1.1 MiB", + "count_diff": 15000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:542", + "size_diff": 1089000, + "size_diff_human": "1.0 MiB", + "count_diff": 20000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/codebase_rag/graph_updater.py:46", + "size_diff": 788278, + "size_diff_human": "769.8 KiB", + "count_diff": 16101 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:515", + "size_diff": 754088, + "size_diff_human": "736.4 KiB", + "count_diff": 2002 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:558", + "size_diff": 706790, + "size_diff_human": "690.2 KiB", + "count_diff": 10000 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:555", + "size_diff": 645120, + "size_diff_human": "630.0 KiB", + "count_diff": 10001 + }, + { + "file": "/Users/vitaliavagyan/Documents/code-graph-rag/optimize/memory_profile.py:559", + "size_diff": 478890, + "size_diff_human": "467.7 KiB", + "count_diff": 10000 + } + ] + }, + "peak_traced_memory": { + "current": 25128953, + "current_human": "24.0 MiB", + "peak": 25135561, + "peak_human": "24.0 MiB" + } + } +} diff --git a/optimize/profile_io.py b/optimize/profile_io.py new file mode 100644 index 000000000..c71d98ecd --- /dev/null +++ b/optimize/profile_io.py @@ -0,0 +1,431 @@ +import hashlib +import json +import statistics +import sys +import time +from collections import defaultdict +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from codebase_rag import constants as cs +from codebase_rag.graph_updater import _hash_file, _load_hash_cache, _save_hash_cache +from codebase_rag.parser_loader import load_parsers +from codebase_rag.parsers.utils import safe_decode_with_fallback +from codebase_rag.services.protobuf_service import ProtobufFileIngestor +from codebase_rag.utils.path_utils import should_skip_path + + +REPO_PATH = Path(__file__).resolve().parent.parent +RUNS = 5 + + +def benchmark(func, *args, runs=RUNS, label=""): + times = [] + result = None + for _ in range(runs): + start = time.perf_counter() + result = func(*args) + elapsed = time.perf_counter() - start + times.append(elapsed) + avg = statistics.mean(times) + std = statistics.stdev(times) if len(times) > 1 else 0.0 + med = statistics.median(times) + return { + "label": label, + "avg_ms": avg * 1000, + "median_ms": med * 1000, + "std_ms": std * 1000, + "min_ms": 
min(times) * 1000, + "max_ms": max(times) * 1000, + "runs": runs, + "result": result, + } + + +def collect_py_files(): + files = [] + for f in REPO_PATH.rglob("*.py"): + if not should_skip_path(f, REPO_PATH): + files.append(f) + return files + + +def profile_file_hashing(files): + print("\n=== FILE HASHING (SHA-256) ===") + results = [] + total_bytes = 0 + for f in files: + total_bytes += f.stat().st_size + + def hash_all(): + for f in files: + _hash_file(f) + + r = benchmark(hash_all, label=f"hash {len(files)} files ({total_bytes/1024:.0f} KB)") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, median={r['median_ms']:.2f}ms, std={r['std_ms']:.2f}ms") + + per_file_ms = r['avg_ms'] / len(files) if files else 0 + print(f" Per file average: {per_file_ms:.3f}ms") + print(f" Throughput: {total_bytes / (r['avg_ms']/1000) / 1024 / 1024:.1f} MB/s") + + single_sizes = [(f, f.stat().st_size) for f in files] + single_sizes.sort(key=lambda x: x[1], reverse=True) + for f, sz in single_sizes[:5]: + r2 = benchmark(_hash_file, f, runs=10, label=f"hash {f.relative_to(REPO_PATH)} ({sz}B)") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.3f}ms") + + return results + + +def profile_file_reading(files): + print("\n=== FILE READING (read_bytes + parse) ===") + results = [] + + def read_all_bytes(): + for f in files: + f.read_bytes() + + total_bytes = sum(f.stat().st_size for f in files) + r = benchmark(read_all_bytes, label=f"read_bytes {len(files)} files ({total_bytes/1024:.0f} KB)") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, median={r['median_ms']:.2f}ms") + print(f" Throughput: {total_bytes / (r['avg_ms']/1000) / 1024 / 1024:.1f} MB/s") + + def read_all_text(): + for f in files: + f.read_text(encoding="utf-8") + + r2 = benchmark(read_all_text, label=f"read_text {len(files)} files") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.2f}ms, median={r2['median_ms']:.2f}ms") + + return results + + +def profile_tree_sitter_parsing(files): + print("\n=== TREE-SITTER PARSING ===") + results = [] + parsers, queries = load_parsers() + py_parser = parsers.get(cs.SupportedLanguage.PYTHON) + if not py_parser: + print(" Python parser not available, skipping") + return results + + py_files = [f for f in files if f.suffix == ".py"] + file_bytes = [(f, f.read_bytes()) for f in py_files] + + def parse_all(): + for f, src in file_bytes: + py_parser.parse(src) + + r = benchmark(parse_all, label=f"parse {len(py_files)} Python files") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, median={r['median_ms']:.2f}ms") + per_file_ms = r['avg_ms'] / len(py_files) if py_files else 0 + print(f" Per file average: {per_file_ms:.3f}ms") + + file_bytes_sorted = sorted(file_bytes, key=lambda x: len(x[1]), reverse=True) + for f, src in file_bytes_sorted[:5]: + r2 = benchmark(py_parser.parse, src, runs=10, + label=f"parse {f.relative_to(REPO_PATH)} ({len(src)}B)") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.3f}ms") + + return results + + +def profile_json_serialization(): + print("\n=== JSON SERIALIZATION ===") + results = [] + + small = {"key": "value", "num": 42, "arr": [1, 2, 3]} + r = benchmark(json.dumps, small, runs=1000, label="json.dumps small dict") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.4f}ms") + + medium_nodes = [ + {"node_id": i, "labels": ["Function"], "properties": {"name": f"func_{i}", "path": f"src/mod_{i//10}.py", "start_line": i*10, "end_line": i*10+5}} + for i in range(1000) + ] + 
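+ # (H) Illustrative sketch, not part of the patch: the benchmark() helper defined
+ # (H) above wraps any callable and returns wall-clock statistics. A minimal
+ # (H) standalone use, assuming the helper is in scope, times SHA-256 hashing:
+ # import hashlib
+ # payload = b"x" * 1_000_000
+ # stats = benchmark(hashlib.sha256, payload, runs=10, label="sha256 over 1 MB")
+ # print(f"{stats['label']}: avg={stats['avg_ms']:.2f}ms, min={stats['min_ms']:.2f}ms")
+ # (H) Throughput in MB/s then follows as len(payload) / (stats["avg_ms"] / 1000) / 2**20.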
medium_rels = [ + {"from_id": i, "to_id": (i+1) % 1000, "type": "CALLS", "properties": {}} + for i in range(2000) + ] + medium = {"nodes": medium_nodes, "relationships": medium_rels, "metadata": {"total_nodes": 1000, "total_relationships": 2000}} + + r2 = benchmark(json.dumps, medium, runs=5, label=f"json.dumps graph (1K nodes, 2K rels, {len(json.dumps(medium))/1024:.0f}KB)") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.2f}ms") + + json_str = json.dumps(medium) + r3 = benchmark(json.loads, json_str, runs=5, label=f"json.loads graph ({len(json_str)/1024:.0f}KB)") + results.append(r3) + print(f" {r3['label']}: avg={r3['avg_ms']:.2f}ms") + + large_nodes = medium_nodes * 10 + large_rels = medium_rels * 10 + large = {"nodes": large_nodes, "relationships": large_rels, "metadata": {"total_nodes": 10000, "total_relationships": 20000}} + large_json = json.dumps(large) + r4 = benchmark(json.dumps, large, runs=3, label=f"json.dumps large graph (10K nodes, 20K rels, {len(large_json)/1024:.0f}KB)") + results.append(r4) + print(f" {r4['label']}: avg={r4['avg_ms']:.2f}ms") + + r5 = benchmark(json.loads, large_json, runs=3, label=f"json.loads large graph ({len(large_json)/1024:.0f}KB)") + results.append(r5) + print(f" {r5['label']}: avg={r5['avg_ms']:.2f}ms") + + with_indent = lambda d: json.dumps(d, indent=2, ensure_ascii=False) + r6 = benchmark(with_indent, large, runs=3, label=f"json.dumps large graph (indent=2)") + results.append(r6) + print(f" {r6['label']}: avg={r6['avg_ms']:.2f}ms") + + return results + + +def profile_protobuf_serialization(): + print("\n=== PROTOBUF SERIALIZATION ===") + results = [] + try: + import codec.schema_pb2 as pb + except ImportError: + print(" protobuf schema not available, skipping") + return results + + import tempfile, shutil + tmp_dir = Path(tempfile.mkdtemp()) + try: + ingestor = ProtobufFileIngestor(output_path=str(tmp_dir)) + + for i in range(100): + ingestor.ensure_node_batch("Function", { + "qualified_name": f"project.mod.func_{i}", + "name": f"func_{i}", + "path": f"src/mod.py", + "start_line": i * 10, + "end_line": i * 10 + 5, + }) + for i in range(200): + ingestor.ensure_relationship_batch( + ("Function", "qualified_name", f"project.mod.func_{i % 100}"), + "CALLS", + ("Function", "qualified_name", f"project.mod.func_{(i+1) % 100}"), + ) + + def flush_protobuf(): + ingestor.flush_all() + + r = benchmark(flush_protobuf, runs=5, label="protobuf flush (100 nodes, 200 rels)") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms") + + index_file = tmp_dir / "graph_code_index.pb" + if index_file.exists(): + size = index_file.stat().st_size + print(f" Output size: {size} bytes") + + def read_protobuf(): + idx = pb.GraphCodeIndex() + idx.ParseFromString(index_file.read_bytes()) + return idx + + r2 = benchmark(read_protobuf, runs=10, label=f"protobuf parse ({size}B)") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.3f}ms") + + for node_path in tmp_dir.iterdir(): + if node_path.suffix == ".pb": + sz = node_path.stat().st_size + print(f" Protobuf file: {node_path.name} ({sz} bytes)") + + finally: + shutil.rmtree(tmp_dir) + + return results + + +def profile_hash_cache_io(): + print("\n=== HASH CACHE I/O ===") + results = [] + + import tempfile + tmp = Path(tempfile.mkdtemp()) + try: + cache_data = {f"path/to/file_{i}.py": hashlib.sha256(f"content_{i}".encode()).hexdigest() for i in range(1000)} + cache_path = tmp / ".file_hashes.json" + + r = benchmark(_save_hash_cache, cache_path, cache_data, runs=5, label=f"save 
hash cache ({len(cache_data)} entries)") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, size={cache_path.stat().st_size/1024:.1f}KB") + + r2 = benchmark(_load_hash_cache, cache_path, runs=5, label=f"load hash cache ({len(cache_data)} entries)") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.2f}ms") + finally: + import shutil + shutil.rmtree(tmp) + + return results + + +def profile_file_traversal(): + print("\n=== FILESYSTEM TRAVERSAL ===") + results = [] + + def rglob_all(): + return list(REPO_PATH.rglob("*")) + + r = benchmark(rglob_all, runs=5, label="rglob('*') entire repo") + results.append(r) + all_paths = r['result'] + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, found {len(all_paths)} paths") + + def rglob_with_filter(): + eligible = [] + for f in REPO_PATH.rglob("*"): + if f.is_file() and not should_skip_path(f, REPO_PATH): + eligible.append(f) + return eligible + + r2 = benchmark(rglob_with_filter, runs=5, label="rglob + should_skip_path filter") + results.append(r2) + eligible = r2['result'] + print(f" {r2['label']}: avg={r2['avg_ms']:.2f}ms, eligible {len(eligible)} files") + + overhead_ms = r2['avg_ms'] - r['avg_ms'] + print(f" Filter overhead: {overhead_ms:.2f}ms") + + return results + + +def profile_source_extraction(): + print("\n=== SOURCE EXTRACTION ===") + results = [] + from codebase_rag.utils.source_extraction import extract_source_lines + + py_files = [f for f in REPO_PATH.rglob("*.py") + if not should_skip_path(f, REPO_PATH) and f.stat().st_size > 100] + if not py_files: + print(" No Python files found") + return results + + target = py_files[0] + line_count = len(target.read_text().splitlines()) + + def extract_50_lines(): + return extract_source_lines(target, 1, min(50, line_count)) + + r = benchmark(extract_50_lines, runs=20, label=f"extract 50 lines from {target.relative_to(REPO_PATH)}") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.3f}ms") + + def extract_all_files_10_lines(): + for f in py_files[:50]: + extract_source_lines(f, 1, 10) + + r2 = benchmark(extract_all_files_10_lines, runs=5, label=f"extract 10 lines from {min(50, len(py_files))} files") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.2f}ms") + + return results + + +def profile_embedding_cache_io(): + print("\n=== EMBEDDING CACHE I/O ===") + results = [] + import tempfile + + from codebase_rag.embedder import EmbeddingCache + + tmp = Path(tempfile.mkdtemp()) + try: + cache = EmbeddingCache(path=tmp / "embedding_cache.json") + for i in range(500): + cache.put(f"def func_{i}(): pass", [float(j) / 768 for j in range(768)]) + + def save_cache(): + cache.save() + + r = benchmark(save_cache, runs=5, label=f"save embedding cache ({len(cache)} entries, 768-dim)") + results.append(r) + size = (tmp / "embedding_cache.json").stat().st_size + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, size={size/1024/1024:.2f}MB") + + def load_cache(): + new_cache = EmbeddingCache(path=tmp / "embedding_cache.json") + new_cache.load() + return new_cache + + r2 = benchmark(load_cache, runs=5, label=f"load embedding cache ({size/1024/1024:.2f}MB)") + results.append(r2) + print(f" {r2['label']}: avg={r2['avg_ms']:.2f}ms") + print(f" Throughput: {size / (r2['avg_ms']/1000) / 1024 / 1024:.1f} MB/s") + finally: + import shutil + shutil.rmtree(tmp) + + return results + + +def profile_directory_structure(): + print("\n=== DIRECTORY STRUCTURE IDENTIFICATION ===") + results = [] + from codebase_rag.language_spec import LANGUAGE_SPECS + + package_indicators 
= set() + for spec in LANGUAGE_SPECS.values(): + package_indicators.update(spec.package_indicators) + + def identify_packages(): + dirs = set() + for p in REPO_PATH.rglob("*"): + if p.is_dir() and not should_skip_path(p, REPO_PATH): + dirs.add(p) + packages = 0 + for d in dirs: + for indicator in package_indicators: + if (d / indicator).exists(): + packages += 1 + break + return packages + + r = benchmark(identify_packages, runs=5, label="identify package structure") + results.append(r) + print(f" {r['label']}: avg={r['avg_ms']:.2f}ms, packages={r['result']}") + + return results + + +def main(): + print("=" * 70) + print("I/O AND SERIALIZATION LATENCY PROFILE") + print(f"Repo: {REPO_PATH}") + print("=" * 70) + + all_results = [] + files = collect_py_files() + print(f"\nPython files for profiling: {len(files)}") + + all_results.extend(profile_file_traversal()) + all_results.extend(profile_file_reading(files)) + all_results.extend(profile_file_hashing(files)) + all_results.extend(profile_tree_sitter_parsing(files)) + all_results.extend(profile_source_extraction()) + all_results.extend(profile_json_serialization()) + all_results.extend(profile_protobuf_serialization()) + all_results.extend(profile_hash_cache_io()) + all_results.extend(profile_embedding_cache_io()) + all_results.extend(profile_directory_structure()) + + print("\n" + "=" * 70) + print("RANKED SUMMARY (by avg wall-clock time)") + print("=" * 70) + ranked = sorted(all_results, key=lambda x: x['avg_ms'], reverse=True) + for i, r in enumerate(ranked, 1): + print(f" {i:2d}. [{r['avg_ms']:10.2f}ms] {r['label']}") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 12160521b..1317694d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,28 +1,59 @@ [project] name = "code-graph-rag" -version = "0.0.60" +version = "0.0.184" description = "The ultimate RAG for your monorepo. 
Query, understand, and edit multi-language codebases with the power of AI and knowledge graphs" -readme = "README.md" +readme = "PYPI_README.md" requires-python = ">=3.12" +license = "MIT" +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Code Generators", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] +keywords = [ + "rag", + "retrieval-augmented-generation", + "knowledge-graph", + "code-analysis", + "tree-sitter", + "mcp", + "mcp-server", + "llm", + "graph-database", + "semantic-search", + "codebase", + "memgraph", + "developer-tools", + "monorepo", +] dependencies = [ "loguru>=0.7.3", - "mcp>=1.21.1", - "pydantic-ai>=1.27.0", - "pydantic-settings>=2.0.0", - "pymgclient>=1.4.0", - "python-dotenv>=1.1.0", + "mcp>=1.25.0", + "pydantic-ai>=1.70.0", + "pydantic-settings>=2.12.0", + "pymgclient>=1.5.1", + "python-dotenv>=1.2.1", + "tiktoken>=0.12.0", "toml>=0.10.2", - "tree-sitter-python>=0.23.6", - "tree-sitter==0.25.0", + "tree-sitter-python>=0.25.0", + "tree-sitter==0.25.2", "watchdog>=6.0.0", - "typer>=0.12.5", - "rich>=13.7.1", - "prompt-toolkit>=3.0.0", + "typer>=0.21.1", + "rich>=14.2.0", + "prompt-toolkit>=3.0.52", "diff-match-patch>=20241021", - "click>=8.0.0", - "protobuf>=5.27.0", + "click>=8.3.1", + "protobuf>=6.33.5", "defusedxml>=0.7.1", - "huggingface-hub[hf-xet]>=0.36.0", + "huggingface-hub[hf-xet]>=1.7.2", ] [project.scripts] @@ -32,8 +63,9 @@ cgr = "codebase_rag.cli:app" [tool.uv] package = true -[tool.setuptools] -packages = ["codebase_rag", "codec"] +[tool.setuptools.packages.find] +include = ["codebase_rag*", "codec*", "cgr*"] +exclude = ["*.tests", "*.tests.*"] [project.optional-dependencies] test = [ @@ -52,8 +84,10 @@ treesitter-full = [ "tree-sitter-go>=0.23.4", "tree-sitter-scala>=0.24.0", "tree-sitter-java>=0.23.5", + "tree-sitter-c>=0.24.1", "tree-sitter-cpp>=0.23.0", "tree-sitter-lua>=0.0.19", + "tree-sitter-php>=0.24.1", ] semantic = [ @@ -65,7 +99,7 @@ semantic = [ [tool.ruff] line-length = 88 target-version = "py312" -exclude = ["codec/"] +exclude = ["codec/", "benchmarks/", "optimize/"] [tool.ruff.lint] select = ["E", "F", "W", "I", "UP", "PL", "T201"] @@ -83,6 +117,7 @@ ignore = [ [tool.ruff.lint.per-file-ignores] "**/tests/**" = ["T201"] +"benchmarks/**" = ["T201"] [tool.ruff.format] quote-style = "double" @@ -91,7 +126,7 @@ quote-style = "double" python-version = "3.12" [tool.ty.src] -exclude = ["codebase_rag/tests/test_cypher_queries.py", "codebase_rag/tests/test_code_retrieval.py", "codebase_rag/tests/test_call_resolver.py"] +exclude = ["codebase_rag/tests/test_cypher_queries.py", "codebase_rag/tests/test_code_retrieval.py", "codebase_rag/tests/test_call_resolver.py", "benchmarks/", "optimize/"] [tool.pytest.ini_options] asyncio_mode = "auto" @@ -113,6 +148,7 @@ dev = [ "pre-commit>=4.2.0", "pyinstaller>=6.14.1", "pylint>=4.0.4", + "pytest>=9.0.2", "radon>=6.0.1", "ruff>=0.5.5", "semgrep>=1.79.0", @@ -121,7 +157,15 @@ dev = [ "types-toml>=0.10.8.20240310", "vulture>=2.14", ] +docs = [ + "mkdocs>=1.6.1,<2", + "mkdocs-material>=9.7.3", + "mkdocs-minify-plugin>=0.8.0", +] +fuzz = [ + "atheris>=2.3.0", +] [tool.bandit] -exclude_dirs = 
["codebase_rag/tests", "scripts"] +exclude_dirs = ["codebase_rag/tests", "scripts", "benchmarks", "optimize"] skips = ["B101"] diff --git a/realtime_updater.py b/realtime_updater.py index 4fd95d5bc..f3bc21f65 100644 --- a/realtime_updater.py +++ b/realtime_updater.py @@ -1,4 +1,5 @@ import sys +import threading import time from pathlib import Path from typing import Annotated @@ -14,7 +15,10 @@ from codebase_rag.config import settings from codebase_rag.constants import ( CYPHER_DELETE_CALLS, + CYPHER_DELETE_FILE, CYPHER_DELETE_MODULE, + DEFAULT_DEBOUNCE_SECONDS, + DEFAULT_MAX_WAIT_SECONDS, IGNORE_PATTERNS, IGNORE_SUFFIXES, KEY_PATH, @@ -32,11 +36,47 @@ class CodeChangeEventHandler(FileSystemEventHandler): - def __init__(self, updater: GraphUpdater): + """ + Handles file system events with debouncing to prevent redundant graph updates. + + The handler implements a hybrid debounce strategy: + - Debounce: Waits for a quiet period after the last change before processing + - Max wait: Ensures updates happen within a maximum time window, even during + continuous editing + + This prevents the graph update process from running repeatedly when a file + is saved multiple times in quick succession (common during active development). + """ + + def __init__( + self, + updater: GraphUpdater, + debounce_seconds: float = DEFAULT_DEBOUNCE_SECONDS, + max_wait_seconds: float = DEFAULT_MAX_WAIT_SECONDS, + ): self.updater = updater self.ignore_patterns = IGNORE_PATTERNS self.ignore_suffixes = IGNORE_SUFFIXES - logger.info(logs.WATCHER_ACTIVE) + + # (H) Debounce configuration + self.debounce_seconds = debounce_seconds + self.max_wait_seconds = max_wait_seconds + self.debounce_enabled = debounce_seconds > 0 + + # (H) Thread-safe state for tracking pending changes + self.timers: dict[str, threading.Timer] = {} + self.first_event_time: dict[str, float] = {} + self.pending_events: dict[str, FileSystemEvent] = {} + self.lock = threading.Lock() + + if self.debounce_enabled: + logger.info( + logs.WATCHER_DEBOUNCE_ACTIVE.format( + debounce=debounce_seconds, max_wait=max_wait_seconds + ) + ) + else: + logger.info(logs.WATCHER_ACTIVE) def _is_relevant(self, path_str: str) -> bool: path = Path(path_str) @@ -65,6 +105,99 @@ def dispatch(self, event: FileSystemEvent) -> None: if event.is_directory or not self._is_relevant(src_path): return + if not self.debounce_enabled: + # (H) No debouncing - process immediately (legacy behavior) + self._process_change(event) + return + + # (H) Debounced processing with hybrid approach + path = Path(src_path) + relative_path_str = str(path.relative_to(self.updater.repo_path)) + current_time = time.time() + + with self.lock: + # (H) Track the first event time for max-wait calculation + if relative_path_str not in self.first_event_time: + self.first_event_time[relative_path_str] = current_time + logger.info( + logs.CHANGE_DEBOUNCING.format( + event_type=event.event_type, + name=path.name, + debounce=self.debounce_seconds, + ) + ) + + # (H) Always store the latest event for this file + self.pending_events[relative_path_str] = event + + # (H) Cancel any existing timer for this file + if relative_path_str in self.timers: + self.timers[relative_path_str].cancel() + logger.debug(logs.DEBOUNCE_RESET.format(path=relative_path_str)) + + # (H) Check if max wait time has been exceeded + time_since_first = current_time - self.first_event_time[relative_path_str] + + if time_since_first >= self.max_wait_seconds: + # (H) Max wait exceeded - process immediately + logger.info( + 
logs.DEBOUNCE_MAX_WAIT.format( + max_wait=self.max_wait_seconds, path=relative_path_str + ) + ) + self._schedule_immediate_processing(relative_path_str) + else: + # (H) Schedule debounced processing + remaining_wait = self.max_wait_seconds - time_since_first + effective_delay = min(self.debounce_seconds, remaining_wait) + timer = threading.Timer( + effective_delay, + self._process_debounced_change, + args=[relative_path_str], + ) + timer.daemon = True + self.timers[relative_path_str] = timer + timer.start() + + logger.debug( + logs.DEBOUNCE_SCHEDULED.format( + path=relative_path_str, + debounce=self.debounce_seconds, + remaining=f"{remaining_wait:.1f}", + ) + ) + + def _schedule_immediate_processing(self, relative_path_str: str) -> None: + """Process a file change immediately (called when max wait is exceeded).""" + # (H) Use a zero-delay timer to process in the timer thread + timer = threading.Timer( + 0, self._process_debounced_change, args=[relative_path_str] + ) + timer.daemon = True + self.timers[relative_path_str] = timer + timer.start() + + def _process_debounced_change(self, relative_path_str: str) -> None: + """Process a debounced file change after the timer fires.""" + with self.lock: + # (H) Retrieve and clear pending state for this file + event = self.pending_events.pop(relative_path_str, None) + self.first_event_time.pop(relative_path_str, None) + self.timers.pop(relative_path_str, None) + + if event is None: + logger.warning(logs.DEBOUNCE_NO_EVENT.format(path=relative_path_str)) + return + + logger.info(logs.DEBOUNCE_PROCESSING.format(path=relative_path_str)) + self._process_change(event) + + def _process_change(self, event: FileSystemEvent) -> None: + """Execute the actual graph update for a file change.""" + src_path = event.src_path + if isinstance(src_path, bytes): + src_path = src_path.decode() + ingestor = self.updater.ingestor if not isinstance(ingestor, QueryProtocol): logger.warning(logs.WATCHER_SKIP_NO_QUERY) @@ -73,18 +206,31 @@ def dispatch(self, event: FileSystemEvent) -> None: path = Path(src_path) relative_path_str = str(path.relative_to(self.updater.repo_path)) + # (H) Only process events that actually change file content + # (H) Skip read-only events like "opened", "closed_no_write" that don't modify the file + relevant_events = { + EventType.MODIFIED, + EventType.CREATED, + EventType.DELETED, # (H) watchdog deletion event + } + if event.event_type not in relevant_events: + return + logger.warning( logs.CHANGE_DETECTED.format(event_type=event.event_type, path=path) ) - # (H) Step 1 + # (H) Step 1: Delete existing nodes for this file path + # (H) Delete Module node and its children (for code files) ingestor.execute_write(CYPHER_DELETE_MODULE, {KEY_PATH: relative_path_str}) + # (H) Delete File node (for all files including non-code like .md, .json) + ingestor.execute_write(CYPHER_DELETE_FILE, {KEY_PATH: relative_path_str}) logger.debug(logs.DELETION_QUERY.format(path=relative_path_str)) - # (H) Step 2 + # (H) Step 2: Clear in-memory state self.updater.remove_file_from_state(path) - # (H) Step 3 + # (H) Step 3: Re-parse code files and create File nodes for ALL files if event.event_type in (EventType.MODIFIED, EventType.CREATED): lang_config = get_language_spec(path.suffix) if ( @@ -101,18 +247,28 @@ def dispatch(self, event: FileSystemEvent) -> None: root_node, language = result self.updater.ast_cache[path] = (root_node, language) + # (H) Create File node for ALL files (code and non-code like .md, .json, etc.) 
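+ # (H) Illustrative sketch, not part of the patch: the cancel-and-reschedule
+ # (H) pattern used by dispatch() above, reduced to its core. Each new event
+ # (H) cancels the pending timer and arms a fresh one, so the callback fires
+ # (H) only after a quiet period; the names here are hypothetical.
+ # import threading
+ #
+ # class Debouncer:
+ #     def __init__(self, delay: float, callback) -> None:
+ #         self.delay = delay
+ #         self.callback = callback
+ #         self.timer: threading.Timer | None = None
+ #         self.lock = threading.Lock()
+ #
+ #     def trigger(self) -> None:
+ #         with self.lock:
+ #             if self.timer is not None:
+ #                 self.timer.cancel()  # (H) reset the quiet-period window
+ #             self.timer = threading.Timer(self.delay, self.callback)
+ #             self.timer.daemon = True
+ #             self.timer.start()
+ #
+ # d = Debouncer(5.0, lambda: print("flush"))
+ # d.trigger()
+ # d.trigger()  # (H) second call resets the 5s window; callback fires once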
+ self.updater.factory.structure_processor.process_generic_file( + path, path.name + ) + # (H) Step 4 logger.info(logs.RECALC_CALLS) ingestor.execute_write(CYPHER_DELETE_CALLS) self.updater._process_function_calls() - # (H) Step 5 + # (H) Step 5: Flush changes to database self.updater.ingestor.flush_all() logger.success(logs.GRAPH_UPDATED.format(name=path.name)) def start_watcher( - repo_path: str, host: str, port: int, batch_size: int | None = None + repo_path: str, + host: str, + port: int, + batch_size: int | None = None, + debounce_seconds: float = DEFAULT_DEBOUNCE_SECONDS, + max_wait_seconds: float = DEFAULT_MAX_WAIT_SECONDS, ) -> None: repo_path_obj = Path(repo_path).resolve() parsers, queries = load_parsers() @@ -123,11 +279,27 @@ def start_watcher( host=host, port=port, batch_size=effective_batch_size, + username=settings.MEMGRAPH_USERNAME, + password=settings.MEMGRAPH_PASSWORD, ) as ingestor: - _run_watcher_loop(ingestor, repo_path_obj, parsers, queries) + _run_watcher_loop( + ingestor, + repo_path_obj, + parsers, + queries, + debounce_seconds, + max_wait_seconds, + ) -def _run_watcher_loop(ingestor, repo_path_obj, parsers, queries): +def _run_watcher_loop( + ingestor, + repo_path_obj, + parsers, + queries, + debounce_seconds: float, + max_wait_seconds: float, +): updater = GraphUpdater(ingestor, repo_path_obj, parsers, queries) # (H) Initial full scan builds the complete context for real-time updates @@ -135,7 +307,11 @@ def _run_watcher_loop(ingestor, repo_path_obj, parsers, queries): updater.run() logger.success(logs.INITIAL_SCAN_DONE) - event_handler = CodeChangeEventHandler(updater) + event_handler = CodeChangeEventHandler( + updater, + debounce_seconds=debounce_seconds, + max_wait_seconds=max_wait_seconds, + ) observer = Observer() observer.schedule(event_handler, str(repo_path_obj), recursive=True) observer.start() @@ -157,6 +333,12 @@ def _validate_positive_int(value: int | None) -> int | None: return value +def _validate_non_negative_float(value: float) -> float: + if value < 0: + raise typer.BadParameter(te.INVALID_NON_NEGATIVE_FLOAT.format(value=value)) + return value + + def main( repo_path: Annotated[str, typer.Argument(help=ch.HELP_REPO_PATH_WATCH)], host: Annotated[ @@ -172,11 +354,62 @@ def main( callback=_validate_positive_int, ), ] = None, + debounce: Annotated[ + float, + typer.Option( + "--debounce", + "-d", + help=ch.HELP_DEBOUNCE, + callback=_validate_non_negative_float, + ), + ] = DEFAULT_DEBOUNCE_SECONDS, + max_wait: Annotated[ + float, + typer.Option( + "--max-wait", + "-m", + help=ch.HELP_MAX_WAIT, + callback=_validate_non_negative_float, + ), + ] = DEFAULT_MAX_WAIT_SECONDS, ) -> None: + """ + Watch a repository for file changes and update the knowledge graph in real-time. + + The watcher uses a hybrid debouncing strategy to efficiently handle rapid file saves: + + - DEBOUNCE: After a file change, waits for a quiet period before processing. + This batches rapid saves into a single update. + + - MAX_WAIT: Ensures updates happen within a maximum time window, even during + continuous editing. Prevents indefinite delays. 
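+ # (H) Illustrative sketch, not part of the patch: how the hybrid policy picks
+ # (H) the timer delay, mirroring the min(debounce, remaining_wait) computation
+ # (H) in dispatch() above; the function name is hypothetical. With debounce=5s
+ # (H) and max_wait=30s, saves every 2s keep resetting the 5s timer until 30s
+ # (H) have elapsed since the first event, at which point the remaining wait wins:
+ # def effective_delay(debounce: float, max_wait: float, since_first: float) -> float:
+ #     remaining = max_wait - since_first
+ #     return max(0.0, min(debounce, remaining))
+ #
+ # assert effective_delay(5.0, 30.0, 2.0) == 5.0   # (H) quiet period governs early on
+ # assert effective_delay(5.0, 30.0, 28.0) == 2.0  # (H) max-wait cap takes over
+ # assert effective_delay(5.0, 30.0, 31.0) == 0.0  # (H) past the cap: fire immediately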
+ + Examples: + + # Default settings (5s debounce, 30s max wait) + python realtime_updater.py /path/to/repo + + # More aggressive batching for background monitoring + python realtime_updater.py /path/to/repo --debounce 10 --max-wait 60 + + # Quick feedback for demos + python realtime_updater.py /path/to/repo --debounce 2 --max-wait 10 + + # Disable debouncing (legacy behavior) + python realtime_updater.py /path/to/repo --debounce 0 + """ logger.remove() logger.add(sys.stdout, format=REALTIME_LOGGER_FORMAT, level=LOG_LEVEL_INFO) logger.info(logs.LOGGER_CONFIGURED) - start_watcher(repo_path, host, port, batch_size) + + # (H) Validate max_wait is greater than debounce when both are enabled + if debounce > 0 and max_wait > 0 and max_wait < debounce: + logger.warning( + logs.DEBOUNCE_MAX_WAIT_ADJUSTED.format(max_wait=max_wait, debounce=debounce) + ) + max_wait = debounce + + start_watcher(repo_path, host, port, batch_size, debounce, max_wait) if __name__ == "__main__": diff --git a/scripts/hooks/generate_readme.py b/scripts/hooks/generate_readme.py index 88394ff55..51d6bbeec 100644 --- a/scripts/hooks/generate_readme.py +++ b/scripts/hooks/generate_readme.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import hashlib import subprocess import sys from pathlib import Path @@ -6,6 +7,8 @@ repo_root = Path(__file__).parent.parent.parent readme_path = repo_root / "README.md" +before = hashlib.sha256(readme_path.read_bytes()).hexdigest() + result = subprocess.run( ["uv", "run", "python", "scripts/generate_readme.py"], check=False, @@ -18,5 +21,9 @@ sys.stderr.write(result.stderr) sys.exit(result.returncode) -subprocess.run(["git", "add", "README.md"], cwd=repo_root, check=True) +after = hashlib.sha256(readme_path.read_bytes()).hexdigest() + +if before != after: + subprocess.run(["git", "add", "README.md"], cwd=repo_root, check=True) + sys.exit(1) sys.exit(0) diff --git a/server.json b/server.json new file mode 100644 index 000000000..91ec2c0c4 --- /dev/null +++ b/server.json @@ -0,0 +1,78 @@ +{ + "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json", + "name": "io.github.vitali87/code-graph-rag", + "title": "Code-Graph-RAG", + "description": "Graph-based RAG system for multi-language codebases. 
Parse, index, query, and edit code using knowledge graphs and natural language.", + "websiteUrl": "https://code-graph-rag.com", + "repository": { + "url": "https://github.com/vitali87/code-graph-rag", + "source": "github" + }, + "version": "0.0.184", + "packages": [ + { + "registryType": "pypi", + "registryBaseUrl": "https://pypi.org", + "identifier": "code-graph-rag", + "version": "0.0.184", + "runtimeHint": "uvx", + "transport": { + "type": "stdio" + }, + "packageArguments": [ + { + "type": "positional", + "value": "mcp-server" + } + ], + "environmentVariables": [ + { + "name": "ORCHESTRATOR_PROVIDER", + "description": "LLM provider for the orchestrator agent (openai, anthropic, google, azure, cohere, ollama)", + "default": "anthropic" + }, + { + "name": "ORCHESTRATOR_MODEL", + "description": "Model name for the orchestrator agent", + "default": "claude-sonnet-4-20250514" + }, + { + "name": "ORCHESTRATOR_API_KEY", + "description": "API key for the orchestrator LLM provider", + "isRequired": true, + "isSecret": true + }, + { + "name": "CYPHER_PROVIDER", + "description": "LLM provider for Cypher query generation (openai, anthropic, google, azure, cohere, ollama)", + "default": "anthropic" + }, + { + "name": "CYPHER_MODEL", + "description": "Model name for Cypher query generation", + "default": "claude-sonnet-4-20250514" + }, + { + "name": "CYPHER_API_KEY", + "description": "API key for the Cypher LLM provider", + "isRequired": true, + "isSecret": true + }, + { + "name": "MEMGRAPH_HOST", + "description": "Hostname of the Memgraph database", + "default": "localhost" + }, + { + "name": "MEMGRAPH_PORT", + "description": "Port of the Memgraph database", + "default": "7687" + }, + { + "name": "TARGET_REPO_PATH", + "description": "Path to the repository to analyze (auto-detected from working directory if not set)" + } + ] + } + ] +} diff --git a/sonar-project.properties b/sonar-project.properties new file mode 100644 index 000000000..796dc31c5 --- /dev/null +++ b/sonar-project.properties @@ -0,0 +1,13 @@ +sonar.projectKey=vitali87_code-graph-rag +sonar.organization=vitali87 +sonar.projectName=code-graph-rag + +sonar.sources=codebase_rag +sonar.tests=codebase_rag/tests +sonar.exclusions=**/__pycache__/**,**/*.pyc,codebase_rag/tests/** +sonar.security.exclusions=codebase_rag/tests/** + +sonar.python.version=3.12 +sonar.python.coverage.reportPaths=coverage.xml + +sonar.sourceEncoding=UTF-8 diff --git a/uv.lock b/uv.lock index aa1977b86..3289915d2 100644 --- a/uv.lock +++ b/uv.lock @@ -146,7 +146,7 @@ wheels = [ [[package]] name = "anthropic" -version = "0.76.0" +version = "0.86.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -158,9 +158,9 @@ dependencies = [ { name = "sniffio" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6e/be/d11abafaa15d6304826438170f7574d750218f49a106c54424a40cef4494/anthropic-0.76.0.tar.gz", hash = "sha256:e0cae6a368986d5cf6df743dfbb1b9519e6a9eee9c6c942ad8121c0b34416ffe", size = 495483, upload-time = "2026-01-13T18:41:14.908Z" } +sdist = { url = "https://files.pythonhosted.org/packages/37/7a/8b390dc47945d3169875d342847431e5f7d5fa716b2e37494d57cfc1db10/anthropic-0.86.0.tar.gz", hash = "sha256:60023a7e879aa4fbb1fed99d487fe407b2ebf6569603e5047cfe304cebdaa0e5", size = 583820, upload-time = "2026-03-18T18:43:08.017Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/70/7b0fd9c1a738f59d3babe2b4212031c34ab7d0fda4ffef15b58a55c5bcea/anthropic-0.76.0-py3-none-any.whl", hash = 
"sha256:81efa3113901192af2f0fe977d3ec73fdadb1e691586306c4256cd6d5ccc331c", size = 390309, upload-time = "2026-01-13T18:41:13.483Z" }, + { url = "https://files.pythonhosted.org/packages/63/5f/67db29c6e5d16c8c9c4652d3efb934d89cb750cad201539141781d8eae14/anthropic-0.86.0-py3-none-any.whl", hash = "sha256:9d2bbd339446acce98858c5627d33056efe01f70435b22b63546fe7edae0cd57", size = 469400, upload-time = "2026-03-18T18:43:06.526Z" }, ] [[package]] @@ -194,6 +194,16 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ce/66/686ac4fc6ef48f5bacde625adac698f41d5316a9753c2b20bb0931c9d4e2/astroid-4.0.3-py3-none-any.whl", hash = "sha256:864a0a34af1bd70e1049ba1e61cee843a7252c826d97825fcee9b2fcbd9e1b14", size = 276443, upload-time = "2026-01-03T22:14:24.412Z" }, ] +[[package]] +name = "atheris" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/58/5965955898e16bee17c8379eae12194993bf641c4629016991248b862069/atheris-3.0.0.tar.gz", hash = "sha256:1f0929c7bc3040f3fe4102e557718734190cf2d7718bbb8e3ce6d3eb56ef5bb3", size = 373239, upload-time = "2025-11-24T23:54:02.15Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/8c/e9960b996e70e5f6a523670431166b2b238de52fef094955515dcf854da1/atheris-3.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:510e502c57b6dc615fb174066407af620d4c7f73cf08a782c86e7761bf12c4eb", size = 34907016, upload-time = "2025-11-24T23:53:56.535Z" }, + { url = "https://files.pythonhosted.org/packages/db/48/df670f75f458cc7c1752a01a394fd59c830b08172dd59cf29d73f31050f9/atheris-3.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a402cdca8a650d1371050b1f9552eb4cdc488d2db64950d603c4560318365eac", size = 34858525, upload-time = "2025-11-24T23:53:59.925Z" }, +] + [[package]] name = "attrs" version = "25.4.0" @@ -205,14 +215,37 @@ wheels = [ [[package]] name = "authlib" -version = "1.6.6" +version = "1.6.9" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cryptography" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/bb/9b/b1661026ff24bc641b76b78c5222d614776b0c085bcfdac9bd15a1cb4b35/authlib-1.6.6.tar.gz", hash = "sha256:45770e8e056d0f283451d9996fbb59b70d45722b45d854d58f32878d0a40c38e", size = 164894, upload-time = "2025-12-12T08:01:41.464Z" } +sdist = { url = "https://files.pythonhosted.org/packages/af/98/00d3dd826d46959ad8e32af2dbb2398868fd9fd0683c26e56d0789bd0e68/authlib-1.6.9.tar.gz", hash = "sha256:d8f2421e7e5980cc1ddb4e32d3f5fa659cfaf60d8eaf3281ebed192e4ab74f04", size = 165134, upload-time = "2026-03-02T07:44:01.998Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/23/b65f568ed0c22f1efacb744d2db1a33c8068f384b8c9b482b52ebdbc3ef6/authlib-1.6.9-py2.py3-none-any.whl", hash = "sha256:f08b4c14e08f0861dc18a32357b33fbcfd2ea86cfe3fe149484b4d764c4a0ac3", size = 244197, upload-time = "2026-03-02T07:44:00.307Z" }, +] + +[[package]] +name = "babel" +version = "2.18.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7d/b2/51899539b6ceeeb420d40ed3cd4b7a40519404f9baf3d4ac99dc413a834b/babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d", size = 9959554, upload-time = "2026-02-01T12:30:56.078Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/f5/21d2de20e8b8b0408f0681956ca2c69f1320a3848ac50e6e7f39c6159675/babel-2.18.0-py3-none-any.whl", hash = 
"sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35", size = 10196845, upload-time = "2026-02-01T12:30:53.445Z" }, +] + +[[package]] +name = "backrefs" +version = "6.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4e/a6/e325ec73b638d3ede4421b5445d4a0b8b219481826cc079d510100af356c/backrefs-6.2.tar.gz", hash = "sha256:f44ff4d48808b243b6c0cdc6231e22195c32f77046018141556c66f8bab72a49", size = 7012303, upload-time = "2026-02-16T19:10:15.828Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/54/51/321e821856452f7386c4e9df866f196720b1ad0c5ea1623ea7399969ae3b/authlib-1.6.6-py2.py3-none-any.whl", hash = "sha256:7d9e9bc535c13974313a87f53e8430eb6ea3d1cf6ae4f6efcd793f2e949143fd", size = 244005, upload-time = "2025-12-12T08:01:40.209Z" }, + { url = "https://files.pythonhosted.org/packages/1b/39/3765df263e08a4df37f4f43cb5aa3c6c17a4bdd42ecfe841e04c26037171/backrefs-6.2-py310-none-any.whl", hash = "sha256:0fdc7b012420b6b144410342caeb8adc54c6866cf12064abc9bb211302e496f8", size = 381075, upload-time = "2026-02-16T19:10:04.322Z" }, + { url = "https://files.pythonhosted.org/packages/0f/f0/35240571e1b67ffb19dafb29ab34150b6f59f93f717b041082cdb1bfceb1/backrefs-6.2-py311-none-any.whl", hash = "sha256:08aa7fae530c6b2361d7bdcbda1a7c454e330cc9dbcd03f5c23205e430e5c3be", size = 392874, upload-time = "2026-02-16T19:10:06.314Z" }, + { url = "https://files.pythonhosted.org/packages/e3/63/77e8c9745b4d227cce9f5e0a6f68041278c5f9b18588b35905f5f19c1beb/backrefs-6.2-py312-none-any.whl", hash = "sha256:c3f4b9cb2af8cda0d87ab4f57800b57b95428488477be164dd2b47be54db0c90", size = 398787, upload-time = "2026-02-16T19:10:08.274Z" }, + { url = "https://files.pythonhosted.org/packages/c5/71/c754b1737ad99102e03fa3235acb6cb6d3ac9d6f596cbc3e5f236705abd8/backrefs-6.2-py313-none-any.whl", hash = "sha256:12df81596ab511f783b7d87c043ce26bc5b0288cf3bb03610fe76b8189282b2b", size = 400747, upload-time = "2026-02-16T19:10:09.791Z" }, + { url = "https://files.pythonhosted.org/packages/af/75/be12ba31a6eb20dccef2320cd8ccb3f7d9013b68ba4c70156259fee9e409/backrefs-6.2-py314-none-any.whl", hash = "sha256:e5f805ae09819caa1aa0623b4a83790e7028604aa2b8c73ba602c4454e665de7", size = 412602, upload-time = "2026-02-16T19:10:12.317Z" }, + { url = "https://files.pythonhosted.org/packages/21/f8/d02f650c47d05034dcd6f9c8cf94f39598b7a89c00ecda0ecb2911bc27e9/backrefs-6.2-py39-none-any.whl", hash = "sha256:664e33cd88c6840b7625b826ecf2555f32d491800900f5a541f772c485f7cda7", size = 381077, upload-time = "2026-02-16T19:10:13.74Z" }, ] [[package]] @@ -461,7 +494,7 @@ wheels = [ [[package]] name = "code-graph-rag" -version = "0.0.58" +version = "0.0.175" source = { editable = "." 
} dependencies = [ { name = "click" }, @@ -477,6 +510,7 @@ dependencies = [ { name = "pymgclient" }, { name = "python-dotenv" }, { name = "rich" }, + { name = "tiktoken" }, { name = "toml" }, { name = "tree-sitter" }, { name = "tree-sitter-python" }, @@ -498,11 +532,13 @@ test = [ { name = "testcontainers" }, ] treesitter-full = [ + { name = "tree-sitter-c" }, { name = "tree-sitter-cpp" }, { name = "tree-sitter-go" }, { name = "tree-sitter-java" }, { name = "tree-sitter-javascript" }, { name = "tree-sitter-lua" }, + { name = "tree-sitter-php" }, { name = "tree-sitter-python" }, { name = "tree-sitter-rust" }, { name = "tree-sitter-scala" }, @@ -516,6 +552,7 @@ dev = [ { name = "pre-commit" }, { name = "pyinstaller" }, { name = "pylint" }, + { name = "pytest" }, { name = "radon" }, { name = "ruff" }, { name = "semgrep" }, @@ -524,43 +561,54 @@ dev = [ { name = "types-toml" }, { name = "vulture" }, ] +docs = [ + { name = "mkdocs" }, + { name = "mkdocs-material" }, + { name = "mkdocs-minify-plugin" }, +] +fuzz = [ + { name = "atheris" }, +] [package.metadata] requires-dist = [ - { name = "click", specifier = ">=8.0.0" }, + { name = "click", specifier = ">=8.3.1" }, { name = "defusedxml", specifier = ">=0.7.1" }, { name = "diff-match-patch", specifier = ">=20241021" }, - { name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.36.0" }, + { name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=1.7.2" }, { name = "loguru", specifier = ">=0.7.3" }, - { name = "mcp", specifier = ">=1.21.1" }, - { name = "prompt-toolkit", specifier = ">=3.0.0" }, - { name = "protobuf", specifier = ">=5.27.0" }, - { name = "pydantic-ai", specifier = ">=1.27.0" }, - { name = "pydantic-settings", specifier = ">=2.0.0" }, - { name = "pymgclient", specifier = ">=1.4.0" }, + { name = "mcp", specifier = ">=1.25.0" }, + { name = "prompt-toolkit", specifier = ">=3.0.52" }, + { name = "protobuf", specifier = ">=6.33.5" }, + { name = "pydantic-ai", specifier = ">=1.70.0" }, + { name = "pydantic-settings", specifier = ">=2.12.0" }, + { name = "pymgclient", specifier = ">=1.5.1" }, { name = "pytest", marker = "extra == 'test'", specifier = ">=8.4.1" }, { name = "pytest-asyncio", marker = "extra == 'test'", specifier = ">=1.0.0" }, { name = "pytest-cov", marker = "extra == 'test'", specifier = ">=4.0.0" }, { name = "pytest-xdist", marker = "extra == 'test'", specifier = ">=3.8.0" }, - { name = "python-dotenv", specifier = ">=1.1.0" }, + { name = "python-dotenv", specifier = ">=1.2.1" }, { name = "qdrant-client", marker = "extra == 'semantic'", specifier = ">=1.9.0" }, - { name = "rich", specifier = ">=13.7.1" }, + { name = "rich", specifier = ">=14.2.0" }, { name = "testcontainers", marker = "extra == 'test'", specifier = ">=4.9.0" }, + { name = "tiktoken", specifier = ">=0.12.0" }, { name = "toml", specifier = ">=0.10.2" }, { name = "torch", marker = "extra == 'semantic'", specifier = ">=2.6.0" }, { name = "transformers", marker = "extra == 'semantic'", specifier = ">=4.0.0" }, - { name = "tree-sitter", specifier = "==0.25.0" }, + { name = "tree-sitter", specifier = "==0.25.2" }, + { name = "tree-sitter-c", marker = "extra == 'treesitter-full'", specifier = ">=0.24.1" }, { name = "tree-sitter-cpp", marker = "extra == 'treesitter-full'", specifier = ">=0.23.0" }, { name = "tree-sitter-go", marker = "extra == 'treesitter-full'", specifier = ">=0.23.4" }, { name = "tree-sitter-java", marker = "extra == 'treesitter-full'", specifier = ">=0.23.5" }, { name = "tree-sitter-javascript", marker = "extra == 
'treesitter-full'", specifier = ">=0.23.1" }, { name = "tree-sitter-lua", marker = "extra == 'treesitter-full'", specifier = ">=0.0.19" }, - { name = "tree-sitter-python", specifier = ">=0.23.6" }, + { name = "tree-sitter-php", marker = "extra == 'treesitter-full'", specifier = ">=0.24.1" }, + { name = "tree-sitter-python", specifier = ">=0.25.0" }, { name = "tree-sitter-python", marker = "extra == 'treesitter-full'", specifier = ">=0.23.6" }, { name = "tree-sitter-rust", marker = "extra == 'treesitter-full'", specifier = ">=0.24.0" }, { name = "tree-sitter-scala", marker = "extra == 'treesitter-full'", specifier = ">=0.24.0" }, { name = "tree-sitter-typescript", marker = "extra == 'treesitter-full'", specifier = ">=0.23.2" }, - { name = "typer", specifier = ">=0.12.5" }, + { name = "typer", specifier = ">=0.21.1" }, { name = "watchdog", specifier = ">=6.0.0" }, ] provides-extras = ["test", "treesitter-full", "semantic"] @@ -572,6 +620,7 @@ dev = [ { name = "pre-commit", specifier = ">=4.2.0" }, { name = "pyinstaller", specifier = ">=6.14.1" }, { name = "pylint", specifier = ">=4.0.4" }, + { name = "pytest", specifier = ">=9.0.2" }, { name = "radon", specifier = ">=6.0.1" }, { name = "ruff", specifier = ">=0.5.5" }, { name = "semgrep", specifier = ">=1.79.0" }, @@ -580,10 +629,16 @@ dev = [ { name = "types-toml", specifier = ">=0.10.8.20240310" }, { name = "vulture", specifier = ">=2.14" }, ] +docs = [ + { name = "mkdocs", specifier = ">=1.6.1,<2" }, + { name = "mkdocs-material", specifier = ">=9.7.3" }, + { name = "mkdocs-minify-plugin", specifier = ">=0.8.0" }, +] +fuzz = [{ name = "atheris", specifier = ">=2.3.0" }] [[package]] name = "cohere" -version = "5.20.1" +version = "5.20.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "fastavro" }, @@ -595,9 +650,9 @@ dependencies = [ { name = "types-requests" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/4b/ed/bb02083654bdc089ae4ef1cd7691fd2233f1fd9f32bcbfacc80ff57d9775/cohere-5.20.1.tar.gz", hash = "sha256:50973f63d2c6138ff52ce37d8d6f78ccc539af4e8c43865e960d68e0bf835b6f", size = 180820, upload-time = "2025-12-18T16:39:50.975Z" } +sdist = { url = "https://files.pythonhosted.org/packages/44/0b/96e2b55a0114ed9d69b3154565f54b764e7530735426290b000f467f4c0f/cohere-5.20.7.tar.gz", hash = "sha256:997ed85fabb3a1e4a4c036fdb520382e7bfa670db48eb59a026803b6f7061dbb", size = 184986, upload-time = "2026-02-25T01:22:18.673Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/e3/94eb11ac3ebaaa3a6afb5d2ff23db95d58bc468ae538c388edf49f2f20b5/cohere-5.20.1-py3-none-any.whl", hash = "sha256:d230fd13d95ba92ae927fce3dd497599b169883afc7954fe29b39fb8d5df5fc7", size = 318973, upload-time = "2025-12-18T16:39:49.504Z" }, + { url = "https://files.pythonhosted.org/packages/9d/86/dc991a75e3b9c2007b90dbfaf7f36fdb2457c216f799e26ce0474faf0c1f/cohere-5.20.7-py3-none-any.whl", hash = "sha256:043fef2a12c30c07e9b2c1f0b869fd66ffd911f58d1492f87e901c4190a65914", size = 323389, upload-time = "2026-02-25T01:22:16.902Z" }, ] [[package]] @@ -685,59 +740,62 @@ wheels = [ [[package]] name = "cryptography" -version = "46.0.3" +version = "46.0.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/9f/33/c00162f49c0e2fe8064a62cb92b93e50c74a72bc370ab92f86112b33ff62/cryptography-46.0.3.tar.gz", hash = 
"sha256:a8b17438104fed022ce745b362294d9ce35b4c2e45c1d958ad4a4b019285f4a1", size = 749258, upload-time = "2025-10-15T23:18:31.74Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1d/42/9c391dd801d6cf0d561b5890549d4b27bafcc53b39c31a817e69d87c625b/cryptography-46.0.3-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:109d4ddfadf17e8e7779c39f9b18111a09efb969a301a31e987416a0191ed93a", size = 7225004, upload-time = "2025-10-15T23:16:52.239Z" }, - { url = "https://files.pythonhosted.org/packages/1c/67/38769ca6b65f07461eb200e85fc1639b438bdc667be02cf7f2cd6a64601c/cryptography-46.0.3-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:09859af8466b69bc3c27bdf4f5d84a665e0f7ab5088412e9e2ec49758eca5cbc", size = 4296667, upload-time = "2025-10-15T23:16:54.369Z" }, - { url = "https://files.pythonhosted.org/packages/5c/49/498c86566a1d80e978b42f0d702795f69887005548c041636df6ae1ca64c/cryptography-46.0.3-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:01ca9ff2885f3acc98c29f1860552e37f6d7c7d013d7334ff2a9de43a449315d", size = 4450807, upload-time = "2025-10-15T23:16:56.414Z" }, - { url = "https://files.pythonhosted.org/packages/4b/0a/863a3604112174c8624a2ac3c038662d9e59970c7f926acdcfaed8d61142/cryptography-46.0.3-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:6eae65d4c3d33da080cff9c4ab1f711b15c1d9760809dad6ea763f3812d254cb", size = 4299615, upload-time = "2025-10-15T23:16:58.442Z" }, - { url = "https://files.pythonhosted.org/packages/64/02/b73a533f6b64a69f3cd3872acb6ebc12aef924d8d103133bb3ea750dc703/cryptography-46.0.3-cp311-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5bf0ed4490068a2e72ac03d786693adeb909981cc596425d09032d372bcc849", size = 4016800, upload-time = "2025-10-15T23:17:00.378Z" }, - { url = "https://files.pythonhosted.org/packages/25/d5/16e41afbfa450cde85a3b7ec599bebefaef16b5c6ba4ec49a3532336ed72/cryptography-46.0.3-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:5ecfccd2329e37e9b7112a888e76d9feca2347f12f37918facbb893d7bb88ee8", size = 4984707, upload-time = "2025-10-15T23:17:01.98Z" }, - { url = "https://files.pythonhosted.org/packages/c9/56/e7e69b427c3878352c2fb9b450bd0e19ed552753491d39d7d0a2f5226d41/cryptography-46.0.3-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a2c0cd47381a3229c403062f764160d57d4d175e022c1df84e168c6251a22eec", size = 4482541, upload-time = "2025-10-15T23:17:04.078Z" }, - { url = "https://files.pythonhosted.org/packages/78/f6/50736d40d97e8483172f1bb6e698895b92a223dba513b0ca6f06b2365339/cryptography-46.0.3-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:549e234ff32571b1f4076ac269fcce7a808d3bf98b76c8dd560e42dbc66d7d91", size = 4299464, upload-time = "2025-10-15T23:17:05.483Z" }, - { url = "https://files.pythonhosted.org/packages/00/de/d8e26b1a855f19d9994a19c702fa2e93b0456beccbcfe437eda00e0701f2/cryptography-46.0.3-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:c0a7bb1a68a5d3471880e264621346c48665b3bf1c3759d682fc0864c540bd9e", size = 4950838, upload-time = "2025-10-15T23:17:07.425Z" }, - { url = "https://files.pythonhosted.org/packages/8f/29/798fc4ec461a1c9e9f735f2fc58741b0daae30688f41b2497dcbc9ed1355/cryptography-46.0.3-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:10b01676fc208c3e6feeb25a8b83d81767e8059e1fe86e1dc62d10a3018fa926", size = 4481596, upload-time = "2025-10-15T23:17:09.343Z" }, - { url = 
"https://files.pythonhosted.org/packages/15/8d/03cd48b20a573adfff7652b76271078e3045b9f49387920e7f1f631d125e/cryptography-46.0.3-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0abf1ffd6e57c67e92af68330d05760b7b7efb243aab8377e583284dbab72c71", size = 4426782, upload-time = "2025-10-15T23:17:11.22Z" }, - { url = "https://files.pythonhosted.org/packages/fa/b1/ebacbfe53317d55cf33165bda24c86523497a6881f339f9aae5c2e13e57b/cryptography-46.0.3-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a04bee9ab6a4da801eb9b51f1b708a1b5b5c9eb48c03f74198464c66f0d344ac", size = 4698381, upload-time = "2025-10-15T23:17:12.829Z" }, - { url = "https://files.pythonhosted.org/packages/96/92/8a6a9525893325fc057a01f654d7efc2c64b9de90413adcf605a85744ff4/cryptography-46.0.3-cp311-abi3-win32.whl", hash = "sha256:f260d0d41e9b4da1ed1e0f1ce571f97fe370b152ab18778e9e8f67d6af432018", size = 3055988, upload-time = "2025-10-15T23:17:14.65Z" }, - { url = "https://files.pythonhosted.org/packages/7e/bf/80fbf45253ea585a1e492a6a17efcb93467701fa79e71550a430c5e60df0/cryptography-46.0.3-cp311-abi3-win_amd64.whl", hash = "sha256:a9a3008438615669153eb86b26b61e09993921ebdd75385ddd748702c5adfddb", size = 3514451, upload-time = "2025-10-15T23:17:16.142Z" }, - { url = "https://files.pythonhosted.org/packages/2e/af/9b302da4c87b0beb9db4e756386a7c6c5b8003cd0e742277888d352ae91d/cryptography-46.0.3-cp311-abi3-win_arm64.whl", hash = "sha256:5d7f93296ee28f68447397bf5198428c9aeeab45705a55d53a6343455dcb2c3c", size = 2928007, upload-time = "2025-10-15T23:17:18.04Z" }, - { url = "https://files.pythonhosted.org/packages/f5/e2/a510aa736755bffa9d2f75029c229111a1d02f8ecd5de03078f4c18d91a3/cryptography-46.0.3-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:00a5e7e87938e5ff9ff5447ab086a5706a957137e6e433841e9d24f38a065217", size = 7158012, upload-time = "2025-10-15T23:17:19.982Z" }, - { url = "https://files.pythonhosted.org/packages/73/dc/9aa866fbdbb95b02e7f9d086f1fccfeebf8953509b87e3f28fff927ff8a0/cryptography-46.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c8daeb2d2174beb4575b77482320303f3d39b8e81153da4f0fb08eb5fe86a6c5", size = 4288728, upload-time = "2025-10-15T23:17:21.527Z" }, - { url = "https://files.pythonhosted.org/packages/c5/fd/bc1daf8230eaa075184cbbf5f8cd00ba9db4fd32d63fb83da4671b72ed8a/cryptography-46.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:39b6755623145ad5eff1dab323f4eae2a32a77a7abef2c5089a04a3d04366715", size = 4435078, upload-time = "2025-10-15T23:17:23.042Z" }, - { url = "https://files.pythonhosted.org/packages/82/98/d3bd5407ce4c60017f8ff9e63ffee4200ab3e23fe05b765cab805a7db008/cryptography-46.0.3-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:db391fa7c66df6762ee3f00c95a89e6d428f4d60e7abc8328f4fe155b5ac6e54", size = 4293460, upload-time = "2025-10-15T23:17:24.885Z" }, - { url = "https://files.pythonhosted.org/packages/26/e9/e23e7900983c2b8af7a08098db406cf989d7f09caea7897e347598d4cd5b/cryptography-46.0.3-cp314-cp314t-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:78a97cf6a8839a48c49271cdcbd5cf37ca2c1d6b7fdd86cc864f302b5e9bf459", size = 3995237, upload-time = "2025-10-15T23:17:26.449Z" }, - { url = "https://files.pythonhosted.org/packages/91/15/af68c509d4a138cfe299d0d7ddb14afba15233223ebd933b4bbdbc7155d3/cryptography-46.0.3-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:dfb781ff7eaa91a6f7fd41776ec37c5853c795d3b358d4896fdbb5df168af422", size = 4967344, upload-time = "2025-10-15T23:17:28.06Z" }, - { url = 
"https://files.pythonhosted.org/packages/ca/e3/8643d077c53868b681af077edf6b3cb58288b5423610f21c62aadcbe99f4/cryptography-46.0.3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:6f61efb26e76c45c4a227835ddeae96d83624fb0d29eb5df5b96e14ed1a0afb7", size = 4466564, upload-time = "2025-10-15T23:17:29.665Z" }, - { url = "https://files.pythonhosted.org/packages/0e/43/c1e8726fa59c236ff477ff2b5dc071e54b21e5a1e51aa2cee1676f1c986f/cryptography-46.0.3-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:23b1a8f26e43f47ceb6d6a43115f33a5a37d57df4ea0ca295b780ae8546e8044", size = 4292415, upload-time = "2025-10-15T23:17:31.686Z" }, - { url = "https://files.pythonhosted.org/packages/42/f9/2f8fefdb1aee8a8e3256a0568cffc4e6d517b256a2fe97a029b3f1b9fe7e/cryptography-46.0.3-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:b419ae593c86b87014b9be7396b385491ad7f320bde96826d0dd174459e54665", size = 4931457, upload-time = "2025-10-15T23:17:33.478Z" }, - { url = "https://files.pythonhosted.org/packages/79/30/9b54127a9a778ccd6d27c3da7563e9f2d341826075ceab89ae3b41bf5be2/cryptography-46.0.3-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:50fc3343ac490c6b08c0cf0d704e881d0d660be923fd3076db3e932007e726e3", size = 4466074, upload-time = "2025-10-15T23:17:35.158Z" }, - { url = "https://files.pythonhosted.org/packages/ac/68/b4f4a10928e26c941b1b6a179143af9f4d27d88fe84a6a3c53592d2e76bf/cryptography-46.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:22d7e97932f511d6b0b04f2bfd818d73dcd5928db509460aaf48384778eb6d20", size = 4420569, upload-time = "2025-10-15T23:17:37.188Z" }, - { url = "https://files.pythonhosted.org/packages/a3/49/3746dab4c0d1979888f125226357d3262a6dd40e114ac29e3d2abdf1ec55/cryptography-46.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d55f3dffadd674514ad19451161118fd010988540cee43d8bc20675e775925de", size = 4681941, upload-time = "2025-10-15T23:17:39.236Z" }, - { url = "https://files.pythonhosted.org/packages/fd/30/27654c1dbaf7e4a3531fa1fc77986d04aefa4d6d78259a62c9dc13d7ad36/cryptography-46.0.3-cp314-cp314t-win32.whl", hash = "sha256:8a6e050cb6164d3f830453754094c086ff2d0b2f3a897a1d9820f6139a1f0914", size = 3022339, upload-time = "2025-10-15T23:17:40.888Z" }, - { url = "https://files.pythonhosted.org/packages/f6/30/640f34ccd4d2a1bc88367b54b926b781b5a018d65f404d409aba76a84b1c/cryptography-46.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:760f83faa07f8b64e9c33fc963d790a2edb24efb479e3520c14a45741cd9b2db", size = 3494315, upload-time = "2025-10-15T23:17:42.769Z" }, - { url = "https://files.pythonhosted.org/packages/ba/8b/88cc7e3bd0a8e7b861f26981f7b820e1f46aa9d26cc482d0feba0ecb4919/cryptography-46.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:516ea134e703e9fe26bcd1277a4b59ad30586ea90c365a87781d7887a646fe21", size = 2919331, upload-time = "2025-10-15T23:17:44.468Z" }, - { url = "https://files.pythonhosted.org/packages/fd/23/45fe7f376a7df8daf6da3556603b36f53475a99ce4faacb6ba2cf3d82021/cryptography-46.0.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:cb3d760a6117f621261d662bccc8ef5bc32ca673e037c83fbe565324f5c46936", size = 7218248, upload-time = "2025-10-15T23:17:46.294Z" }, - { url = "https://files.pythonhosted.org/packages/27/32/b68d27471372737054cbd34c84981f9edbc24fe67ca225d389799614e27f/cryptography-46.0.3-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4b7387121ac7d15e550f5cb4a43aef2559ed759c35df7336c402bb8275ac9683", size = 4294089, upload-time = "2025-10-15T23:17:48.269Z" }, - { url = 
"https://files.pythonhosted.org/packages/26/42/fa8389d4478368743e24e61eea78846a0006caffaf72ea24a15159215a14/cryptography-46.0.3-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:15ab9b093e8f09daab0f2159bb7e47532596075139dd74365da52ecc9cb46c5d", size = 4440029, upload-time = "2025-10-15T23:17:49.837Z" }, - { url = "https://files.pythonhosted.org/packages/5f/eb/f483db0ec5ac040824f269e93dd2bd8a21ecd1027e77ad7bdf6914f2fd80/cryptography-46.0.3-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:46acf53b40ea38f9c6c229599a4a13f0d46a6c3fa9ef19fc1a124d62e338dfa0", size = 4297222, upload-time = "2025-10-15T23:17:51.357Z" }, - { url = "https://files.pythonhosted.org/packages/fd/cf/da9502c4e1912cb1da3807ea3618a6829bee8207456fbbeebc361ec38ba3/cryptography-46.0.3-cp38-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:10ca84c4668d066a9878890047f03546f3ae0a6b8b39b697457b7757aaf18dbc", size = 4012280, upload-time = "2025-10-15T23:17:52.964Z" }, - { url = "https://files.pythonhosted.org/packages/6b/8f/9adb86b93330e0df8b3dcf03eae67c33ba89958fc2e03862ef1ac2b42465/cryptography-46.0.3-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:36e627112085bb3b81b19fed209c05ce2a52ee8b15d161b7c643a7d5a88491f3", size = 4978958, upload-time = "2025-10-15T23:17:54.965Z" }, - { url = "https://files.pythonhosted.org/packages/d1/a0/5fa77988289c34bdb9f913f5606ecc9ada1adb5ae870bd0d1054a7021cc4/cryptography-46.0.3-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1000713389b75c449a6e979ffc7dcc8ac90b437048766cef052d4d30b8220971", size = 4473714, upload-time = "2025-10-15T23:17:56.754Z" }, - { url = "https://files.pythonhosted.org/packages/14/e5/fc82d72a58d41c393697aa18c9abe5ae1214ff6f2a5c18ac470f92777895/cryptography-46.0.3-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:b02cf04496f6576afffef5ddd04a0cb7d49cf6be16a9059d793a30b035f6b6ac", size = 4296970, upload-time = "2025-10-15T23:17:58.588Z" }, - { url = "https://files.pythonhosted.org/packages/78/06/5663ed35438d0b09056973994f1aec467492b33bd31da36e468b01ec1097/cryptography-46.0.3-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:71e842ec9bc7abf543b47cf86b9a743baa95f4677d22baa4c7d5c69e49e9bc04", size = 4940236, upload-time = "2025-10-15T23:18:00.897Z" }, - { url = "https://files.pythonhosted.org/packages/fc/59/873633f3f2dcd8a053b8dd1d38f783043b5fce589c0f6988bf55ef57e43e/cryptography-46.0.3-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:402b58fc32614f00980b66d6e56a5b4118e6cb362ae8f3fda141ba4689bd4506", size = 4472642, upload-time = "2025-10-15T23:18:02.749Z" }, - { url = "https://files.pythonhosted.org/packages/3d/39/8e71f3930e40f6877737d6f69248cf74d4e34b886a3967d32f919cc50d3b/cryptography-46.0.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ef639cb3372f69ec44915fafcd6698b6cc78fbe0c2ea41be867f6ed612811963", size = 4423126, upload-time = "2025-10-15T23:18:04.85Z" }, - { url = "https://files.pythonhosted.org/packages/cd/c7/f65027c2810e14c3e7268353b1681932b87e5a48e65505d8cc17c99e36ae/cryptography-46.0.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:3b51b8ca4f1c6453d8829e1eb7299499ca7f313900dd4d89a24b8b87c0a780d4", size = 4686573, upload-time = "2025-10-15T23:18:06.908Z" }, - { url = "https://files.pythonhosted.org/packages/0a/6e/1c8331ddf91ca4730ab3086a0f1be19c65510a33b5a441cb334e7a2d2560/cryptography-46.0.3-cp38-abi3-win32.whl", hash = "sha256:6276eb85ef938dc035d59b87c8a7dc559a232f954962520137529d77b18ff1df", size = 3036695, upload-time = "2025-10-15T23:18:08.672Z" }, - { url = 
"https://files.pythonhosted.org/packages/90/45/b0d691df20633eff80955a0fc7695ff9051ffce8b69741444bd9ed7bd0db/cryptography-46.0.3-cp38-abi3-win_amd64.whl", hash = "sha256:416260257577718c05135c55958b674000baef9a1c7d9e8f306ec60d71db850f", size = 3501720, upload-time = "2025-10-15T23:18:10.632Z" }, - { url = "https://files.pythonhosted.org/packages/e8/cb/2da4cc83f5edb9c3257d09e1e7ab7b23f049c7962cae8d842bbef0a9cec9/cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372", size = 2918740, upload-time = "2025-10-15T23:18:12.277Z" }, -] +sdist = { url = "https://files.pythonhosted.org/packages/a4/ba/04b1bd4218cbc58dc90ce967106d51582371b898690f3ae0402876cc4f34/cryptography-46.0.6.tar.gz", hash = "sha256:27550628a518c5c6c903d84f637fbecf287f6cb9ced3804838a1295dc1fd0759", size = 750542, upload-time = "2026-03-25T23:34:53.396Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/23/9285e15e3bc57325b0a72e592921983a701efc1ee8f91c06c5f0235d86d9/cryptography-46.0.6-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:64235194bad039a10bb6d2d930ab3323baaec67e2ce36215fd0952fad0930ca8", size = 7176401, upload-time = "2026-03-25T23:33:22.096Z" }, + { url = "https://files.pythonhosted.org/packages/60/f8/e61f8f13950ab6195b31913b42d39f0f9afc7d93f76710f299b5ec286ae6/cryptography-46.0.6-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:26031f1e5ca62fcb9d1fcb34b2b60b390d1aacaa15dc8b895a9ed00968b97b30", size = 4275275, upload-time = "2026-03-25T23:33:23.844Z" }, + { url = "https://files.pythonhosted.org/packages/19/69/732a736d12c2631e140be2348b4ad3d226302df63ef64d30dfdb8db7ad1c/cryptography-46.0.6-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9a693028b9cbe51b5a1136232ee8f2bc242e4e19d456ded3fa7c86e43c713b4a", size = 4425320, upload-time = "2026-03-25T23:33:25.703Z" }, + { url = "https://files.pythonhosted.org/packages/d4/12/123be7292674abf76b21ac1fc0e1af50661f0e5b8f0ec8285faac18eb99e/cryptography-46.0.6-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:67177e8a9f421aa2d3a170c3e56eca4e0128883cf52a071a7cbf53297f18b175", size = 4278082, upload-time = "2026-03-25T23:33:27.423Z" }, + { url = "https://files.pythonhosted.org/packages/5b/ba/d5e27f8d68c24951b0a484924a84c7cdaed7502bac9f18601cd357f8b1d2/cryptography-46.0.6-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:d9528b535a6c4f8ff37847144b8986a9a143585f0540fbcb1a98115b543aa463", size = 4926514, upload-time = "2026-03-25T23:33:29.206Z" }, + { url = "https://files.pythonhosted.org/packages/34/71/1ea5a7352ae516d5512d17babe7e1b87d9db5150b21f794b1377eac1edc0/cryptography-46.0.6-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:22259338084d6ae497a19bae5d4c66b7ca1387d3264d1c2c0e72d9e9b6a77b97", size = 4457766, upload-time = "2026-03-25T23:33:30.834Z" }, + { url = "https://files.pythonhosted.org/packages/01/59/562be1e653accee4fdad92c7a2e88fced26b3fdfce144047519bbebc299e/cryptography-46.0.6-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:760997a4b950ff00d418398ad73fbc91aa2894b5c1db7ccb45b4f68b42a63b3c", size = 3986535, upload-time = "2026-03-25T23:33:33.02Z" }, + { url = "https://files.pythonhosted.org/packages/d6/8b/b1ebfeb788bf4624d36e45ed2662b8bd43a05ff62157093c1539c1288a18/cryptography-46.0.6-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:3dfa6567f2e9e4c5dceb8ccb5a708158a2a871052fa75c8b78cb0977063f1507", size = 4277618, upload-time = "2026-03-25T23:33:34.567Z" }, + { url = 
"https://files.pythonhosted.org/packages/dd/52/a005f8eabdb28df57c20f84c44d397a755782d6ff6d455f05baa2785bd91/cryptography-46.0.6-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:cdcd3edcbc5d55757e5f5f3d330dd00007ae463a7e7aa5bf132d1f22a4b62b19", size = 4890802, upload-time = "2026-03-25T23:33:37.034Z" }, + { url = "https://files.pythonhosted.org/packages/ec/4d/8e7d7245c79c617d08724e2efa397737715ca0ec830ecb3c91e547302555/cryptography-46.0.6-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:d4e4aadb7fc1f88687f47ca20bb7227981b03afaae69287029da08096853b738", size = 4457425, upload-time = "2026-03-25T23:33:38.904Z" }, + { url = "https://files.pythonhosted.org/packages/1d/5c/f6c3596a1430cec6f949085f0e1a970638d76f81c3ea56d93d564d04c340/cryptography-46.0.6-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:2b417edbe8877cda9022dde3a008e2deb50be9c407eef034aeeb3a8b11d9db3c", size = 4405530, upload-time = "2026-03-25T23:33:40.842Z" }, + { url = "https://files.pythonhosted.org/packages/7e/c9/9f9cea13ee2dbde070424e0c4f621c091a91ffcc504ffea5e74f0e1daeff/cryptography-46.0.6-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:380343e0653b1c9d7e1f55b52aaa2dbb2fdf2730088d48c43ca1c7c0abb7cc2f", size = 4667896, upload-time = "2026-03-25T23:33:42.781Z" }, + { url = "https://files.pythonhosted.org/packages/ad/b5/1895bc0821226f129bc74d00eccfc6a5969e2028f8617c09790bf89c185e/cryptography-46.0.6-cp311-abi3-win32.whl", hash = "sha256:bcb87663e1f7b075e48c3be3ecb5f0b46c8fc50b50a97cf264e7f60242dca3f2", size = 3026348, upload-time = "2026-03-25T23:33:45.021Z" }, + { url = "https://files.pythonhosted.org/packages/c3/f8/c9bcbf0d3e6ad288b9d9aa0b1dee04b063d19e8c4f871855a03ab3a297ab/cryptography-46.0.6-cp311-abi3-win_amd64.whl", hash = "sha256:6739d56300662c468fddb0e5e291f9b4d084bead381667b9e654c7dd81705124", size = 3483896, upload-time = "2026-03-25T23:33:46.649Z" }, + { url = "https://files.pythonhosted.org/packages/01/41/3a578f7fd5c70611c0aacba52cd13cb364a5dee895a5c1d467208a9380b0/cryptography-46.0.6-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:2ef9e69886cbb137c2aef9772c2e7138dc581fad4fcbcf13cc181eb5a3ab6275", size = 7117147, upload-time = "2026-03-25T23:33:48.249Z" }, + { url = "https://files.pythonhosted.org/packages/fa/87/887f35a6fca9dde90cad08e0de0c89263a8e59b2d2ff904fd9fcd8025b6f/cryptography-46.0.6-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7f417f034f91dcec1cb6c5c35b07cdbb2ef262557f701b4ecd803ee8cefed4f4", size = 4266221, upload-time = "2026-03-25T23:33:49.874Z" }, + { url = "https://files.pythonhosted.org/packages/aa/a8/0a90c4f0b0871e0e3d1ed126aed101328a8a57fd9fd17f00fb67e82a51ca/cryptography-46.0.6-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d24c13369e856b94892a89ddf70b332e0b70ad4a5c43cf3e9cb71d6d7ffa1f7b", size = 4408952, upload-time = "2026-03-25T23:33:52.128Z" }, + { url = "https://files.pythonhosted.org/packages/16/0b/b239701eb946523e4e9f329336e4ff32b1247e109cbab32d1a7b61da8ed7/cryptography-46.0.6-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:aad75154a7ac9039936d50cf431719a2f8d4ed3d3c277ac03f3339ded1a5e707", size = 4270141, upload-time = "2026-03-25T23:33:54.11Z" }, + { url = "https://files.pythonhosted.org/packages/0f/a8/976acdd4f0f30df7b25605f4b9d3d89295351665c2091d18224f7ad5cdbf/cryptography-46.0.6-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:3c21d92ed15e9cfc6eb64c1f5a0326db22ca9c2566ca46d845119b45b4400361", size = 4904178, upload-time = "2026-03-25T23:33:55.725Z" }, + { url = 
"https://files.pythonhosted.org/packages/b1/1b/bf0e01a88efd0e59679b69f42d4afd5bced8700bb5e80617b2d63a3741af/cryptography-46.0.6-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:4668298aef7cddeaf5c6ecc244c2302a2b8e40f384255505c22875eebb47888b", size = 4441812, upload-time = "2026-03-25T23:33:57.364Z" }, + { url = "https://files.pythonhosted.org/packages/bb/8b/11df86de2ea389c65aa1806f331cae145f2ed18011f30234cc10ca253de8/cryptography-46.0.6-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:8ce35b77aaf02f3b59c90b2c8a05c73bac12cea5b4e8f3fbece1f5fddea5f0ca", size = 3963923, upload-time = "2026-03-25T23:33:59.361Z" }, + { url = "https://files.pythonhosted.org/packages/91/e0/207fb177c3a9ef6a8108f234208c3e9e76a6aa8cf20d51932916bd43bda0/cryptography-46.0.6-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:c89eb37fae9216985d8734c1afd172ba4927f5a05cfd9bf0e4863c6d5465b013", size = 4269695, upload-time = "2026-03-25T23:34:00.909Z" }, + { url = "https://files.pythonhosted.org/packages/21/5e/19f3260ed1e95bced52ace7501fabcd266df67077eeb382b79c81729d2d3/cryptography-46.0.6-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:ed418c37d095aeddf5336898a132fba01091f0ac5844e3e8018506f014b6d2c4", size = 4869785, upload-time = "2026-03-25T23:34:02.796Z" }, + { url = "https://files.pythonhosted.org/packages/10/38/cd7864d79aa1d92ef6f1a584281433419b955ad5a5ba8d1eb6c872165bcb/cryptography-46.0.6-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:69cf0056d6947edc6e6760e5f17afe4bea06b56a9ac8a06de9d2bd6b532d4f3a", size = 4441404, upload-time = "2026-03-25T23:34:04.35Z" }, + { url = "https://files.pythonhosted.org/packages/09/0a/4fe7a8d25fed74419f91835cf5829ade6408fd1963c9eae9c4bce390ecbb/cryptography-46.0.6-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8e7304c4f4e9490e11efe56af6713983460ee0780f16c63f219984dab3af9d2d", size = 4397549, upload-time = "2026-03-25T23:34:06.342Z" }, + { url = "https://files.pythonhosted.org/packages/5f/a0/7d738944eac6513cd60a8da98b65951f4a3b279b93479a7e8926d9cd730b/cryptography-46.0.6-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b928a3ca837c77a10e81a814a693f2295200adb3352395fad024559b7be7a736", size = 4651874, upload-time = "2026-03-25T23:34:07.916Z" }, + { url = "https://files.pythonhosted.org/packages/cb/f1/c2326781ca05208845efca38bf714f76939ae446cd492d7613808badedf1/cryptography-46.0.6-cp314-cp314t-win32.whl", hash = "sha256:97c8115b27e19e592a05c45d0dd89c57f81f841cc9880e353e0d3bf25b2139ed", size = 3001511, upload-time = "2026-03-25T23:34:09.892Z" }, + { url = "https://files.pythonhosted.org/packages/c9/57/fe4a23eb549ac9d903bd4698ffda13383808ef0876cc912bcb2838799ece/cryptography-46.0.6-cp314-cp314t-win_amd64.whl", hash = "sha256:c797e2517cb7880f8297e2c0f43bb910e91381339336f75d2c1c2cbf811b70b4", size = 3471692, upload-time = "2026-03-25T23:34:11.613Z" }, + { url = "https://files.pythonhosted.org/packages/c4/cc/f330e982852403da79008552de9906804568ae9230da8432f7496ce02b71/cryptography-46.0.6-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:12cae594e9473bca1a7aceb90536060643128bb274fcea0fc459ab90f7d1ae7a", size = 7162776, upload-time = "2026-03-25T23:34:13.308Z" }, + { url = "https://files.pythonhosted.org/packages/49/b3/dc27efd8dcc4bff583b3f01d4a3943cd8b5821777a58b3a6a5f054d61b79/cryptography-46.0.6-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:639301950939d844a9e1c4464d7e07f902fe9a7f6b215bb0d4f28584729935d8", size = 4270529, upload-time = "2026-03-25T23:34:15.019Z" }, + { url = 
"https://files.pythonhosted.org/packages/e6/05/e8d0e6eb4f0d83365b3cb0e00eb3c484f7348db0266652ccd84632a3d58d/cryptography-46.0.6-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ed3775295fb91f70b4027aeba878d79b3e55c0b3e97eaa4de71f8f23a9f2eb77", size = 4414827, upload-time = "2026-03-25T23:34:16.604Z" }, + { url = "https://files.pythonhosted.org/packages/2f/97/daba0f5d2dc6d855e2dcb70733c812558a7977a55dd4a6722756628c44d1/cryptography-46.0.6-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8927ccfbe967c7df312ade694f987e7e9e22b2425976ddbf28271d7e58845290", size = 4271265, upload-time = "2026-03-25T23:34:18.586Z" }, + { url = "https://files.pythonhosted.org/packages/89/06/fe1fce39a37ac452e58d04b43b0855261dac320a2ebf8f5260dd55b201a9/cryptography-46.0.6-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:b12c6b1e1651e42ab5de8b1e00dc3b6354fdfd778e7fa60541ddacc27cd21410", size = 4916800, upload-time = "2026-03-25T23:34:20.561Z" }, + { url = "https://files.pythonhosted.org/packages/ff/8a/b14f3101fe9c3592603339eb5d94046c3ce5f7fc76d6512a2d40efd9724e/cryptography-46.0.6-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:063b67749f338ca9c5a0b7fe438a52c25f9526b851e24e6c9310e7195aad3b4d", size = 4448771, upload-time = "2026-03-25T23:34:22.406Z" }, + { url = "https://files.pythonhosted.org/packages/01/b3/0796998056a66d1973fd52ee89dc1bb3b6581960a91ad4ac705f182d398f/cryptography-46.0.6-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:02fad249cb0e090b574e30b276a3da6a149e04ee2f049725b1f69e7b8351ec70", size = 3978333, upload-time = "2026-03-25T23:34:24.281Z" }, + { url = "https://files.pythonhosted.org/packages/c5/3d/db200af5a4ffd08918cd55c08399dc6c9c50b0bc72c00a3246e099d3a849/cryptography-46.0.6-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:7e6142674f2a9291463e5e150090b95a8519b2fb6e6aaec8917dd8d094ce750d", size = 4271069, upload-time = "2026-03-25T23:34:25.895Z" }, + { url = "https://files.pythonhosted.org/packages/d7/18/61acfd5b414309d74ee838be321c636fe71815436f53c9f0334bf19064fa/cryptography-46.0.6-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:456b3215172aeefb9284550b162801d62f5f264a081049a3e94307fe20792cfa", size = 4878358, upload-time = "2026-03-25T23:34:27.67Z" }, + { url = "https://files.pythonhosted.org/packages/8b/65/5bf43286d566f8171917cae23ac6add941654ccf085d739195a4eacf1674/cryptography-46.0.6-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:341359d6c9e68834e204ceaf25936dffeafea3829ab80e9503860dcc4f4dac58", size = 4448061, upload-time = "2026-03-25T23:34:29.375Z" }, + { url = "https://files.pythonhosted.org/packages/e0/25/7e49c0fa7205cf3597e525d156a6bce5b5c9de1fd7e8cb01120e459f205a/cryptography-46.0.6-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9a9c42a2723999a710445bc0d974e345c32adfd8d2fac6d8a251fa829ad31cfb", size = 4399103, upload-time = "2026-03-25T23:34:32.036Z" }, + { url = "https://files.pythonhosted.org/packages/44/46/466269e833f1c4718d6cd496ffe20c56c9c8d013486ff66b4f69c302a68d/cryptography-46.0.6-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6617f67b1606dfd9fe4dbfa354a9508d4a6d37afe30306fe6c101b7ce3274b72", size = 4659255, upload-time = "2026-03-25T23:34:33.679Z" }, + { url = "https://files.pythonhosted.org/packages/0a/09/ddc5f630cc32287d2c953fc5d32705e63ec73e37308e5120955316f53827/cryptography-46.0.6-cp38-abi3-win32.whl", hash = "sha256:7f6690b6c55e9c5332c0b59b9c8a3fb232ebf059094c17f9019a51e9827df91c", size = 3010660, upload-time = "2026-03-25T23:34:35.418Z" }, + { url = 
"https://files.pythonhosted.org/packages/1b/82/ca4893968aeb2709aacfb57a30dec6fa2ab25b10fa9f064b8882ce33f599/cryptography-46.0.6-cp38-abi3-win_amd64.whl", hash = "sha256:79e865c642cfc5c0b3eb12af83c35c5aeff4fa5c672dc28c43721c2c9fdd2f0f", size = 3471160, upload-time = "2026-03-25T23:34:37.191Z" }, +] + +[[package]] +name = "csscompressor" +version = "0.9.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/2a/8c3ac3d8bc94e6de8d7ae270bb5bc437b210bb9d6d9e46630c98f4abd20c/csscompressor-0.9.5.tar.gz", hash = "sha256:afa22badbcf3120a4f392e4d22f9fff485c044a1feda4a950ecc5eba9dd31a05", size = 237808, upload-time = "2017-11-26T21:13:08.238Z" } [[package]] name = "cuda-bindings" @@ -1135,6 +1193,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4a/af/b11b80d02aaefc2fc6bfaabb3ae873439c90dc464b3a29eda51b969842b0/genai_prices-0.0.51-py3-none-any.whl", hash = "sha256:4e0f5892a7ec757d59f343c5dbf9675b0f9e8ed65f4fe26ac7df600e34788ca0", size = 60656, upload-time = "2026-01-13T12:49:12.867Z" }, ] +[[package]] +name = "ghp-import" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "python-dateutil" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d9/29/d40217cbe2f6b1359e00c6c307bb3fc876ba74068cbab3dde77f03ca0dc4/ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343", size = 10943, upload-time = "2022-05-02T15:47:16.11Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619", size = 11034, upload-time = "2022-05-02T15:47:14.552Z" }, +] + [[package]] name = "glom" version = "22.1.0" @@ -1201,15 +1271,12 @@ wheels = [ ] [[package]] -name = "griffe" -version = "1.15.0" +name = "griffelib" +version = "2.0.0" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0d/0c/3a471b6e31951dce2360477420d0a8d1e00dea6cf33b70f3e8c3ab6e28e1/griffe-1.15.0.tar.gz", hash = "sha256:7726e3afd6f298fbc3696e67958803e7ac843c1cfe59734b6251a40cdbfb5eea", size = 424112, upload-time = "2025-11-10T15:03:15.52Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ad/06/eccbd311c9e2b3ca45dbc063b93134c57a1ccc7607c5e545264ad092c4a9/griffelib-2.0.0.tar.gz", hash = "sha256:e504d637a089f5cab9b5daf18f7645970509bf4f53eda8d79ed71cce8bd97934", size = 166312, upload-time = "2026-03-23T21:06:55.954Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/9c/83/3b1d03d36f224edded98e9affd0467630fc09d766c0e56fb1498cbb04a9b/griffe-1.15.0-py3-none-any.whl", hash = "sha256:6f6762661949411031f5fcda9593f586e6ce8340f0ba88921a0f2ef7a81eb9a3", size = 150705, upload-time = "2025-11-10T15:03:13.549Z" }, + { url = "https://files.pythonhosted.org/packages/4d/51/c936033e16d12b627ea334aaaaf42229c37620d0f15593456ab69ab48161/griffelib-2.0.0-py3-none-any.whl", hash = "sha256:01284878c966508b6d6f1dbff9b6fa607bc062d8261c5c7253cb285b06422a7f", size = 142004, upload-time = "2026-02-09T19:09:40.561Z" }, ] [[package]] @@ -1294,31 +1361,34 @@ wheels = [ [[package]] name = "hf-xet" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/5e/6e/0f11bacf08a67f7fb5ee09740f2ca54163863b07b70d579356e9222ce5d8/hf_xet-1.2.0.tar.gz", hash = "sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f", size = 506020, upload-time = "2025-10-24T19:04:32.129Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9e/a5/85ef910a0aa034a2abcfadc360ab5ac6f6bc4e9112349bd40ca97551cff0/hf_xet-1.2.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ceeefcd1b7aed4956ae8499e2199607765fbd1c60510752003b6cc0b8413b649", size = 2861870, upload-time = "2025-10-24T19:04:11.422Z" }, - { url = "https://files.pythonhosted.org/packages/ea/40/e2e0a7eb9a51fe8828ba2d47fe22a7e74914ea8a0db68a18c3aa7449c767/hf_xet-1.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b70218dd548e9840224df5638fdc94bd033552963cfa97f9170829381179c813", size = 2717584, upload-time = "2025-10-24T19:04:09.586Z" }, - { url = "https://files.pythonhosted.org/packages/a5/7d/daf7f8bc4594fdd59a8a596f9e3886133fdc68e675292218a5e4c1b7e834/hf_xet-1.2.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d40b18769bb9a8bc82a9ede575ce1a44c75eb80e7375a01d76259089529b5dc", size = 3315004, upload-time = "2025-10-24T19:04:00.314Z" }, - { url = "https://files.pythonhosted.org/packages/b1/ba/45ea2f605fbf6d81c8b21e4d970b168b18a53515923010c312c06cd83164/hf_xet-1.2.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd3a6027d59cfb60177c12d6424e31f4b5ff13d8e3a1247b3a584bf8977e6df5", size = 3222636, upload-time = "2025-10-24T19:03:58.111Z" }, - { url = "https://files.pythonhosted.org/packages/4a/1d/04513e3cab8f29ab8c109d309ddd21a2705afab9d52f2ba1151e0c14f086/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6de1fc44f58f6dd937956c8d304d8c2dea264c80680bcfa61ca4a15e7b76780f", size = 3408448, upload-time = "2025-10-24T19:04:20.951Z" }, - { url = "https://files.pythonhosted.org/packages/f0/7c/60a2756d7feec7387db3a1176c632357632fbe7849fce576c5559d4520c7/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f182f264ed2acd566c514e45da9f2119110e48a87a327ca271027904c70c5832", size = 3503401, upload-time = "2025-10-24T19:04:22.549Z" }, - { url = "https://files.pythonhosted.org/packages/4e/64/48fffbd67fb418ab07451e4ce641a70de1c40c10a13e25325e24858ebe5a/hf_xet-1.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:293a7a3787e5c95d7be1857358a9130694a9c6021de3f27fa233f37267174382", size = 2900866, upload-time = "2025-10-24T19:04:33.461Z" }, - { url = "https://files.pythonhosted.org/packages/e2/51/f7e2caae42f80af886db414d4e9885fac959330509089f97cccb339c6b87/hf_xet-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:10bfab528b968c70e062607f663e21e34e2bba349e8038db546646875495179e", size = 2861861, upload-time = "2025-10-24T19:04:19.01Z" }, - { url = "https://files.pythonhosted.org/packages/6e/1d/a641a88b69994f9371bd347f1dd35e5d1e2e2460a2e350c8d5165fc62005/hf_xet-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a212e842647b02eb6a911187dc878e79c4aa0aa397e88dd3b26761676e8c1f8", size = 2717699, upload-time = "2025-10-24T19:04:17.306Z" }, - { url = "https://files.pythonhosted.org/packages/df/e0/e5e9bba7d15f0318955f7ec3f4af13f92e773fbb368c0b8008a5acbcb12f/hf_xet-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30e06daccb3a7d4c065f34fc26c14c74f4653069bb2b194e7f18f17cbe9939c0", size = 3314885, upload-time = "2025-10-24T19:04:07.642Z" }, - { url = 
"https://files.pythonhosted.org/packages/21/90/b7fe5ff6f2b7b8cbdf1bd56145f863c90a5807d9758a549bf3d916aa4dec/hf_xet-1.2.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:29c8fc913a529ec0a91867ce3d119ac1aac966e098cf49501800c870328cc090", size = 3221550, upload-time = "2025-10-24T19:04:05.55Z" }, - { url = "https://files.pythonhosted.org/packages/6f/cb/73f276f0a7ce46cc6a6ec7d6c7d61cbfe5f2e107123d9bbd0193c355f106/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e159cbfcfbb29f920db2c09ed8b660eb894640d284f102ada929b6e3dc410a", size = 3408010, upload-time = "2025-10-24T19:04:28.598Z" }, - { url = "https://files.pythonhosted.org/packages/b8/1e/d642a12caa78171f4be64f7cd9c40e3ca5279d055d0873188a58c0f5fbb9/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c91d5ae931510107f148874e9e2de8a16052b6f1b3ca3c1b12f15ccb491390f", size = 3503264, upload-time = "2025-10-24T19:04:30.397Z" }, - { url = "https://files.pythonhosted.org/packages/17/b5/33764714923fa1ff922770f7ed18c2daae034d21ae6e10dbf4347c854154/hf_xet-1.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:210d577732b519ac6ede149d2f2f34049d44e8622bf14eb3d63bbcd2d4b332dc", size = 2901071, upload-time = "2025-10-24T19:04:37.463Z" }, - { url = "https://files.pythonhosted.org/packages/96/2d/22338486473df5923a9ab7107d375dbef9173c338ebef5098ef593d2b560/hf_xet-1.2.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:46740d4ac024a7ca9b22bebf77460ff43332868b661186a8e46c227fdae01848", size = 2866099, upload-time = "2025-10-24T19:04:15.366Z" }, - { url = "https://files.pythonhosted.org/packages/7f/8c/c5becfa53234299bc2210ba314eaaae36c2875e0045809b82e40a9544f0c/hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:27df617a076420d8845bea087f59303da8be17ed7ec0cd7ee3b9b9f579dff0e4", size = 2722178, upload-time = "2025-10-24T19:04:13.695Z" }, - { url = "https://files.pythonhosted.org/packages/9a/92/cf3ab0b652b082e66876d08da57fcc6fa2f0e6c70dfbbafbd470bb73eb47/hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3651fd5bfe0281951b988c0facbe726aa5e347b103a675f49a3fa8144c7968fd", size = 3320214, upload-time = "2025-10-24T19:04:03.596Z" }, - { url = "https://files.pythonhosted.org/packages/46/92/3f7ec4a1b6a65bf45b059b6d4a5d38988f63e193056de2f420137e3c3244/hf_xet-1.2.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d06fa97c8562fb3ee7a378dd9b51e343bc5bc8190254202c9771029152f5e08c", size = 3229054, upload-time = "2025-10-24T19:04:01.949Z" }, - { url = "https://files.pythonhosted.org/packages/0b/dd/7ac658d54b9fb7999a0ccb07ad863b413cbaf5cf172f48ebcd9497ec7263/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4c1428c9ae73ec0939410ec73023c4f842927f39db09b063b9482dac5a3bb737", size = 3413812, upload-time = "2025-10-24T19:04:24.585Z" }, - { url = "https://files.pythonhosted.org/packages/92/68/89ac4e5b12a9ff6286a12174c8538a5930e2ed662091dd2572bbe0a18c8a/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a55558084c16b09b5ed32ab9ed38421e2d87cf3f1f89815764d1177081b99865", size = 3508920, upload-time = "2025-10-24T19:04:26.927Z" }, - { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" }, +version = "1.4.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/09/08/23c84a26716382c89151b5b447b4beb19e3345f3a93d3b73009a71a57ad3/hf_xet-1.4.2.tar.gz", hash = "sha256:b7457b6b482d9e0743bd116363239b1fa904a5e65deede350fbc0c4ea67c71ea", size = 672357, upload-time = "2026-03-13T06:58:51.077Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/06/e8cf74c3c48e5485c7acc5a990d0d8516cdfb5fdf80f799174f1287cc1b5/hf_xet-1.4.2-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ac8202ae1e664b2c15cdfc7298cbb25e80301ae596d602ef7870099a126fcad4", size = 3796125, upload-time = "2026-03-13T06:58:33.177Z" }, + { url = "https://files.pythonhosted.org/packages/66/d4/b73ebab01cbf60777323b7de9ef05550790451eb5172a220d6b9845385ec/hf_xet-1.4.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6d2f8ee39fa9fba9af929f8c0d0482f8ee6e209179ad14a909b6ad78ffcb7c81", size = 3555985, upload-time = "2026-03-13T06:58:31.797Z" }, + { url = "https://files.pythonhosted.org/packages/ff/e7/ded6d1bd041c3f2bca9e913a0091adfe32371988e047dd3a68a2463c15a2/hf_xet-1.4.2-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4642a6cf249c09da8c1f87fe50b24b2a3450b235bf8adb55700b52f0ea6e2eb6", size = 4212085, upload-time = "2026-03-13T06:58:24.323Z" }, + { url = "https://files.pythonhosted.org/packages/97/c1/a0a44d1f98934f7bdf17f7a915b934f9fca44bb826628c553589900f6df8/hf_xet-1.4.2-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:769431385e746c92dc05492dde6f687d304584b89c33d79def8367ace06cb555", size = 3988266, upload-time = "2026-03-13T06:58:22.887Z" }, + { url = "https://files.pythonhosted.org/packages/7a/82/be713b439060e7d1f1d93543c8053d4ef2fe7e6922c5b31642eaa26f3c4b/hf_xet-1.4.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c9dd1c1bc4cc56168f81939b0e05b4c36dd2d28c13dc1364b17af89aa0082496", size = 4188513, upload-time = "2026-03-13T06:58:40.858Z" }, + { url = "https://files.pythonhosted.org/packages/21/a6/cbd4188b22abd80ebd0edbb2b3e87f2633e958983519980815fb8314eae5/hf_xet-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:fca58a2ae4e6f6755cc971ac6fcdf777ea9284d7e540e350bb000813b9a3008d", size = 4428287, upload-time = "2026-03-13T06:58:42.601Z" }, + { url = "https://files.pythonhosted.org/packages/b2/4e/84e45b25e2e3e903ed3db68d7eafa96dae9a1d1f6d0e7fc85120347a852f/hf_xet-1.4.2-cp313-cp313t-win_amd64.whl", hash = "sha256:163aab46854ccae0ab6a786f8edecbbfbaa38fcaa0184db6feceebf7000c93c0", size = 3665574, upload-time = "2026-03-13T06:58:53.881Z" }, + { url = "https://files.pythonhosted.org/packages/ee/71/c5ac2b9a7ae39c14e91973035286e73911c31980fe44e7b1d03730c00adc/hf_xet-1.4.2-cp313-cp313t-win_arm64.whl", hash = "sha256:09b138422ecbe50fd0c84d4da5ff537d27d487d3607183cd10e3e53f05188e82", size = 3528760, upload-time = "2026-03-13T06:58:52.187Z" }, + { url = "https://files.pythonhosted.org/packages/1e/0f/fcd2504015eab26358d8f0f232a1aed6b8d363a011adef83fe130bff88f7/hf_xet-1.4.2-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:949dcf88b484bb9d9276ca83f6599e4aa03d493c08fc168c124ad10b2e6f75d7", size = 3796493, upload-time = "2026-03-13T06:58:39.267Z" }, + { url = "https://files.pythonhosted.org/packages/82/56/19c25105ff81731ca6d55a188b5de2aa99d7a2644c7aa9de1810d5d3b726/hf_xet-1.4.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:41659966020d59eb9559c57de2cde8128b706a26a64c60f0531fa2318f409418", size = 3555797, upload-time = "2026-03-13T06:58:37.546Z" }, + { url = 
"https://files.pythonhosted.org/packages/bf/e3/8933c073186849b5e06762aa89847991d913d10a95d1603eb7f2c3834086/hf_xet-1.4.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c588e21d80010119458dd5d02a69093f0d115d84e3467efe71ffb2c67c19146", size = 4212127, upload-time = "2026-03-13T06:58:30.539Z" }, + { url = "https://files.pythonhosted.org/packages/eb/01/f89ebba4e369b4ed699dcb60d3152753870996f41c6d22d3d7cac01310e1/hf_xet-1.4.2-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a296744d771a8621ad1d50c098d7ab975d599800dae6d48528ba3944e5001ba0", size = 3987788, upload-time = "2026-03-13T06:58:29.139Z" }, + { url = "https://files.pythonhosted.org/packages/84/4d/8a53e5ffbc2cc33bbf755382ac1552c6d9af13f623ed125fe67cc3e6772f/hf_xet-1.4.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f563f7efe49588b7d0629d18d36f46d1658fe7e08dce3fa3d6526e1c98315e2d", size = 4188315, upload-time = "2026-03-13T06:58:48.017Z" }, + { url = "https://files.pythonhosted.org/packages/d1/b8/b7a1c1b5592254bd67050632ebbc1b42cc48588bf4757cb03c2ef87e704a/hf_xet-1.4.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5b2e0132c56d7ee1bf55bdb638c4b62e7106f6ac74f0b786fed499d5548c5570", size = 4428306, upload-time = "2026-03-13T06:58:49.502Z" }, + { url = "https://files.pythonhosted.org/packages/a0/0c/40779e45b20e11c7c5821a94135e0207080d6b3d76e7b78ccb413c6f839b/hf_xet-1.4.2-cp314-cp314t-win_amd64.whl", hash = "sha256:2f45c712c2fa1215713db10df6ac84b49d0e1c393465440e9cb1de73ecf7bbf6", size = 3665826, upload-time = "2026-03-13T06:58:59.88Z" }, + { url = "https://files.pythonhosted.org/packages/51/4c/e2688c8ad1760d7c30f7c429c79f35f825932581bc7c9ec811436d2f21a0/hf_xet-1.4.2-cp314-cp314t-win_arm64.whl", hash = "sha256:6d53df40616f7168abfccff100d232e9d460583b9d86fa4912c24845f192f2b8", size = 3529113, upload-time = "2026-03-13T06:58:58.491Z" }, + { url = "https://files.pythonhosted.org/packages/b4/86/b40b83a2ff03ef05c4478d2672b1fc2b9683ff870e2b25f4f3af240f2e7b/hf_xet-1.4.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:71f02d6e4cdd07f344f6844845d78518cc7186bd2bc52d37c3b73dc26a3b0bc5", size = 3800339, upload-time = "2026-03-13T06:58:36.245Z" }, + { url = "https://files.pythonhosted.org/packages/64/2e/af4475c32b4378b0e92a587adb1aa3ec53e3450fd3e5fe0372a874531c00/hf_xet-1.4.2-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e9b38d876e94d4bdcf650778d6ebbaa791dd28de08db9736c43faff06ede1b5a", size = 3559664, upload-time = "2026-03-13T06:58:34.787Z" }, + { url = "https://files.pythonhosted.org/packages/3c/4c/781267da3188db679e601de18112021a5cb16506fe86b246e22c5401a9c4/hf_xet-1.4.2-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:77e8c180b7ef12d8a96739a4e1e558847002afe9ea63b6f6358b2271a8bdda1c", size = 4217422, upload-time = "2026-03-13T06:58:27.472Z" }, + { url = "https://files.pythonhosted.org/packages/68/47/d6cf4a39ecf6c7705f887a46f6ef5c8455b44ad9eb0d391aa7e8a2ff7fea/hf_xet-1.4.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c3b3c6a882016b94b6c210957502ff7877802d0dbda8ad142c8595db8b944271", size = 3992847, upload-time = "2026-03-13T06:58:25.989Z" }, + { url = "https://files.pythonhosted.org/packages/2d/ef/e80815061abff54697239803948abc665c6b1d237102c174f4f7a9a5ffc5/hf_xet-1.4.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9d9a634cc929cfbaf2e1a50c0e532ae8c78fa98618426769480c58501e8c8ac2", size = 4193843, upload-time = "2026-03-13T06:58:44.59Z" }, + { url = 
"https://files.pythonhosted.org/packages/54/75/07f6aa680575d9646c4167db6407c41340cbe2357f5654c4e72a1b01ca14/hf_xet-1.4.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6b0932eb8b10317ea78b7da6bab172b17be03bbcd7809383d8d5abd6a2233e04", size = 4432751, upload-time = "2026-03-13T06:58:46.533Z" }, + { url = "https://files.pythonhosted.org/packages/cd/71/193eabd7e7d4b903c4aa983a215509c6114915a5a237525ec562baddb868/hf_xet-1.4.2-cp37-abi3-win_amd64.whl", hash = "sha256:ad185719fb2e8ac26f88c8100562dbf9dbdcc3d9d2add00faa94b5f106aea53f", size = 3671149, upload-time = "2026-03-13T06:58:57.07Z" }, + { url = "https://files.pythonhosted.org/packages/b4/7e/ccf239da366b37ba7f0b36095450efae4a64980bdc7ec2f51354205fdf39/hf_xet-1.4.2-cp37-abi3-win_arm64.whl", hash = "sha256:32c012286b581f783653e718c1862aea5b9eb140631685bb0c5e7012c8719a87", size = 3533426, upload-time = "2026-03-13T06:58:55.46Z" }, ] [[package]] @@ -1330,6 +1400,14 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload-time = "2025-01-22T21:44:56.92Z" }, ] +[[package]] +name = "htmlmin2" +version = "0.1.13" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/31/a76f4bfa885f93b8167cb4c85cf32b54d1f64384d0b897d45bc6d19b7b45/htmlmin2-0.1.13-py3-none-any.whl", hash = "sha256:75609f2a42e64f7ce57dbff28a39890363bde9e7e5885db633317efbdf8c79a2", size = 34486, upload-time = "2023-03-14T21:28:30.388Z" }, +] + [[package]] name = "httpcore" version = "1.0.9" @@ -1374,30 +1452,28 @@ wheels = [ [[package]] name = "huggingface-hub" -version = "0.36.0" +version = "1.7.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, - { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "httpx" }, { name = "packaging" }, { name = "pyyaml" }, - { name = "requests" }, { name = "tqdm" }, + { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/98/63/4910c5fa9128fdadf6a9c5ac138e8b1b6cee4ca44bf7915bbfbce4e355ee/huggingface_hub-0.36.0.tar.gz", hash = "sha256:47b3f0e2539c39bf5cde015d63b72ec49baff67b6931c3d97f3f84532e2b8d25", size = 463358, upload-time = "2025-10-23T12:12:01.413Z" } +sdist = { url = "https://files.pythonhosted.org/packages/19/15/eafc1c57bf0f8afffb243dcd4c0cceb785e956acc17bba4d9bf2ae21fc9c/huggingface_hub-1.7.2.tar.gz", hash = "sha256:7f7e294e9bbb822e025bdb2ada025fa4344d978175a7f78e824d86e35f7ab43b", size = 724684, upload-time = "2026-03-20T10:36:08.767Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/cb/bd/1a875e0d592d447cbc02805fd3fe0f497714d6a2583f59d14fa9ebad96eb/huggingface_hub-0.36.0-py3-none-any.whl", hash = "sha256:7bcc9ad17d5b3f07b57c78e79d527102d08313caa278a641993acddcb894548d", size = 566094, upload-time = "2025-10-23T12:11:59.557Z" }, + { url = "https://files.pythonhosted.org/packages/08/de/3ad061a05f74728927ded48c90b73521b9a9328c85d841bdefb30e01fb85/huggingface_hub-1.7.2-py3-none-any.whl", hash = 
"sha256:288f33a0a17b2a73a1359e2a5fd28d1becb2c121748c6173ab8643fb342c850e", size = 618036, upload-time = "2026-03-20T10:36:06.824Z" }, ] [package.optional-dependencies] hf-xet = [ { name = "hf-xet" }, ] -inference = [ - { name = "aiohttp" }, -] [[package]] name = "hyperframe" @@ -1596,6 +1672,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" }, ] +[[package]] +name = "jsmin" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/73/e01e4c5e11ad0494f4407a3f623ad4d87714909f50b17a06ed121034ff6e/jsmin-3.0.1.tar.gz", hash = "sha256:c0959a121ef94542e807a674142606f7e90214a2b3d1eb17300244bbb5cc2bfc", size = 13925, upload-time = "2022-01-16T20:35:59.13Z" } + [[package]] name = "jsonref" version = "1.1.0" @@ -1785,6 +1867,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/f0/834e479e47e499b6478e807fb57b31cc2db696c4db30557bb6f5aea4a90b/mando-0.7.1-py2.py3-none-any.whl", hash = "sha256:26ef1d70928b6057ee3ca12583d73c63e05c49de8972d620c278a7b206581a8a", size = 28149, upload-time = "2022-02-24T08:12:25.24Z" }, ] +[[package]] +name = "markdown" +version = "3.10.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2b/f4/69fa6ed85ae003c2378ffa8f6d2e3234662abd02c10d216c0ba96081a238/markdown-3.10.2.tar.gz", hash = "sha256:994d51325d25ad8aa7ce4ebaec003febcce822c3f8c911e3b17c52f7f589f950", size = 368805, upload-time = "2026-02-09T14:57:26.942Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" }, +] + [[package]] name = "markdown-it-py" version = "4.0.0" @@ -1903,6 +1994,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, ] +[[package]] +name = "mergedeep" +version = "1.3.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3a/41/580bb4006e3ed0361b8151a01d324fb03f420815446c7def45d02f74c270/mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8", size = 4661, upload-time = "2021-02-05T18:55:30.623Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/19/04f9b178c2d8a15b076c8b5140708fa6ffc5601fb6f1e975537072df5b2a/mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307", size = 6354, upload-time = "2021-02-05T18:55:29.583Z" }, +] + [[package]] name = "mistralai" version = "1.9.11" @@ -1921,6 +2021,90 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fe/76/4ce12563aea5a76016f8643eff30ab731e6656c845e9e4d090ef10c7b925/mistralai-1.9.11-py3-none-any.whl", hash = "sha256:7a3dc2b8ef3fceaa3582220234261b5c4e3e03a972563b07afa150e44a25a6d3", size = 442796, upload-time = "2025-10-02T15:53:39.134Z" }, ] +[[package]] +name = "mkdocs" 
+version = "1.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "ghp-import" }, + { name = "jinja2" }, + { name = "markdown" }, + { name = "markupsafe" }, + { name = "mergedeep" }, + { name = "mkdocs-get-deps" }, + { name = "packaging" }, + { name = "pathspec" }, + { name = "pyyaml" }, + { name = "pyyaml-env-tag" }, + { name = "watchdog" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bc/c6/bbd4f061bd16b378247f12953ffcb04786a618ce5e904b8c5a01a0309061/mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2", size = 3889159, upload-time = "2024-08-30T12:24:06.899Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/5b/dbc6a8cddc9cfa9c4971d59fb12bb8d42e161b7e7f8cc89e49137c5b279c/mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e", size = 3864451, upload-time = "2024-08-30T12:24:05.054Z" }, +] + +[[package]] +name = "mkdocs-get-deps" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mergedeep" }, + { name = "platformdirs" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/98/f5/ed29cd50067784976f25ed0ed6fcd3c2ce9eb90650aa3b2796ddf7b6870b/mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c", size = 10239, upload-time = "2023-11-20T17:51:09.981Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/d4/029f984e8d3f3b6b726bd33cafc473b75e9e44c0f7e80a5b29abc466bdea/mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134", size = 9521, upload-time = "2023-11-20T17:51:08.587Z" }, +] + +[[package]] +name = "mkdocs-material" +version = "9.7.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "babel" }, + { name = "backrefs" }, + { name = "colorama" }, + { name = "jinja2" }, + { name = "markdown" }, + { name = "mkdocs" }, + { name = "mkdocs-material-extensions" }, + { name = "paginate" }, + { name = "pygments" }, + { name = "pymdown-extensions" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8a/b4/f900fcb8e6f510241e334ca401eddcb61ed880fb6572f7f32e4228472ca1/mkdocs_material-9.7.3.tar.gz", hash = "sha256:e5f0a18319699da7e78c35e4a8df7e93537a888660f61a86bd773a7134798f22", size = 4097748, upload-time = "2026-02-24T12:06:22.646Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/1b/16ad0193079bb8a15aa1d2620813a9cd15b18de150a4ea1b2c607fb4c74d/mkdocs_material-9.7.3-py3-none-any.whl", hash = "sha256:37ebf7b4788c992203faf2e71900be3c197c70a4be9b0d72aed537b08a91dd9d", size = 9305078, upload-time = "2026-02-24T12:06:19.155Z" }, +] + +[[package]] +name = "mkdocs-material-extensions" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/79/9b/9b4c96d6593b2a541e1cb8b34899a6d021d208bb357042823d4d2cabdbe7/mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443", size = 11847, upload-time = "2023-11-22T19:09:45.208Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/54/662a4743aa81d9582ee9339d4ffa3c8fd40a4965e033d77b9da9774d3960/mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = 
"sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31", size = 8728, upload-time = "2023-11-22T19:09:43.465Z" }, +] + +[[package]] +name = "mkdocs-minify-plugin" +version = "0.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "csscompressor" }, + { name = "htmlmin2" }, + { name = "jsmin" }, + { name = "mkdocs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/67/fe4b77e7a8ae7628392e28b14122588beaf6078b53eb91c7ed000fd158ac/mkdocs-minify-plugin-0.8.0.tar.gz", hash = "sha256:bc11b78b8120d79e817308e2b11539d790d21445eb63df831e393f76e52e753d", size = 8366, upload-time = "2024-01-29T16:11:32.982Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/cd/2e8d0d92421916e2ea4ff97f10a544a9bd5588eb747556701c983581df13/mkdocs_minify_plugin-0.8.0-py3-none-any.whl", hash = "sha256:5fba1a3f7bd9a2142c9954a6559a57e946587b21f133165ece30ea145c66aee6", size = 6723, upload-time = "2024-01-29T16:11:31.851Z" }, +] + [[package]] name = "more-itertools" version = "10.8.0" @@ -2265,7 +2449,7 @@ wheels = [ [[package]] name = "openai" -version = "2.15.0" +version = "2.29.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2277,9 +2461,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/94/f4/4690ecb5d70023ce6bfcfeabfe717020f654bde59a775058ec6ac4692463/openai-2.15.0.tar.gz", hash = "sha256:42eb8cbb407d84770633f31bf727d4ffb4138711c670565a41663d9439174fba", size = 627383, upload-time = "2026-01-09T22:10:08.603Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b4/15/203d537e58986b5673e7f232453a2a2f110f22757b15921cbdeea392e520/openai-2.29.0.tar.gz", hash = "sha256:32d09eb2f661b38d3edd7d7e1a2943d1633f572596febe64c0cd370c86d52bec", size = 671128, upload-time = "2026-03-17T17:53:49.599Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b5/df/c306f7375d42bafb379934c2df4c2fa3964656c8c782bac75ee10c102818/openai-2.15.0-py3-none-any.whl", hash = "sha256:6ae23b932cd7230f7244e52954daa6602716d6b9bf235401a107af731baea6c3", size = 1067879, upload-time = "2026-01-09T22:10:06.446Z" }, + { url = "https://files.pythonhosted.org/packages/d0/b1/35b6f9c8cf9318e3dbb7146cc82dab4cf61182a8d5406fc9b50864362895/openai-2.29.0-py3-none-any.whl", hash = "sha256:b7c5de513c3286d17c5e29b92c4c98ceaf0d775244ac8159aeb1bddf840eb42a", size = 1141533, upload-time = "2026-03-17T17:53:47.348Z" }, ] [[package]] @@ -2439,6 +2623,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, ] +[[package]] +name = "paginate" +version = "0.5.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/46/68dde5b6bc00c1296ec6466ab27dddede6aec9af1b99090e1107091b3b84/paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945", size = 19252, upload-time = "2024-08-25T14:17:24.139Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/96/04b8e52da071d28f5e21a805b19cb9390aa17a47462ac87f5e2696b9566d/paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591", size = 13746, upload-time = "2024-08-25T14:17:22.55Z" }, +] + [[package]] 
name = "pathable" version = "0.4.4" @@ -2448,6 +2641,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7d/eb/b6260b31b1a96386c0a880edebe26f89669098acea8e0318bff6adb378fd/pathable-0.4.4-py3-none-any.whl", hash = "sha256:5ae9e94793b6ef5a4cbe0a7ce9dbbefc1eec38df253763fd0aeeacf2762dbbc2", size = 9592, upload-time = "2025-01-10T18:43:11.88Z" }, ] +[[package]] +name = "pathspec" +version = "1.0.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fa/36/e27608899f9b8d4dff0617b2d9ab17ca5608956ca44461ac14ac48b44015/pathspec-1.0.4.tar.gz", hash = "sha256:0210e2ae8a21a9137c0d470578cb0e595af87edaa6ebf12ff176f14a02e0e645", size = 131200, upload-time = "2026-01-27T03:59:46.938Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/3c/2c197d226f9ea224a9ab8d197933f9da0ae0aac5b6e0f884e2b8d9c8e9f7/pathspec-1.0.4-py3-none-any.whl", hash = "sha256:fb6ae2fd4e7c921a165808a552060e722767cfa526f99ca5156ed2ce45a5c723", size = 55206, upload-time = "2026-01-27T03:59:45.137Z" }, +] + [[package]] name = "pathvalidate" version = "3.3.1" @@ -2684,11 +2886,11 @@ wheels = [ [[package]] name = "pyasn1" -version = "0.6.2" +version = "0.6.3" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/fe/b6/6e630dff89739fcd427e3f72b3d905ce0acb85a45d4ec3e2678718a3487f/pyasn1-0.6.2.tar.gz", hash = "sha256:9b59a2b25ba7e4f8197db7686c09fb33e658b98339fadb826e9512629017833b", size = 146586, upload-time = "2026-01-16T18:04:18.534Z" } +sdist = { url = "https://files.pythonhosted.org/packages/5c/5f/6583902b6f79b399c9c40674ac384fd9cd77805f9e6205075f828ef11fb2/pyasn1-0.6.3.tar.gz", hash = "sha256:697a8ecd6d98891189184ca1fa05d1bb00e2f84b5977c481452050549c8a72cf", size = 148685, upload-time = "2026-03-17T01:06:53.382Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/44/b5/a96872e5184f354da9c84ae119971a0a4c221fe9b27a4d94bd43f2596727/pyasn1-0.6.2-py3-none-any.whl", hash = "sha256:1eb26d860996a18e9b6ed05e7aae0e9fc21619fcee6af91cca9bad4fbea224bf", size = 83371, upload-time = "2026-01-16T18:04:17.174Z" }, + { url = "https://files.pythonhosted.org/packages/5d/a0/7d793dce3fa811fe047d6ae2431c672364b462850c6235ae306c0efd025f/pyasn1-0.6.3-py3-none-any.whl", hash = "sha256:a80184d120f0864a52a073acc6fc642847d0be408e7c7252f31390c0f4eadcde", size = 83997, upload-time = "2026-03-17T01:06:52.036Z" }, ] [[package]] @@ -2734,32 +2936,32 @@ email = [ [[package]] name = "pydantic-ai" -version = "1.46.0" +version = "1.70.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydantic-ai-slim", extra = ["ag-ui", "anthropic", "bedrock", "cli", "cohere", "evals", "fastmcp", "google", "groq", "huggingface", "logfire", "mcp", "mistral", "openai", "retries", "temporal", "ui", "vertexai", "xai"] }, ] -sdist = { url = "https://files.pythonhosted.org/packages/7d/e9/2917eabd9a8f408748e1e91b8d0a1bf695ca7d785f6b88efc3e4bba2fa94/pydantic_ai-1.46.0.tar.gz", hash = "sha256:e71c7d7c905da6f34b8759ad9f6914c31035fed5623ca5ac35096f9d738019cf", size = 11795, upload-time = "2026-01-23T00:07:15.786Z" } +sdist = { url = "https://files.pythonhosted.org/packages/53/98/87c97dce65711f922ac448f9103a0bf7c59be67af6663450a8bee3dc824a/pydantic_ai-1.70.0.tar.gz", hash = "sha256:f06368a4fa91f6abcc11d73524dc81516b63739bd88ac93b330e16708b6f784b", size = 12297, upload-time = "2026-03-18T04:24:32.485Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/4e/9e/ff49bae2eeeb7f0afe0b8bfb49868f4e4e0f2d986be5f2f9883e09c3e09b/pydantic_ai-1.46.0-py3-none-any.whl", hash = "sha256:a9ac9413ae1e57d5f9ce563f6e46aceaaf9602540366e98363d08482e4ddc651", size = 7220, upload-time = "2026-01-23T00:07:08.263Z" }, + { url = "https://files.pythonhosted.org/packages/fa/08/3a49448850ecdbc020ffa9fde9b7e4f6986c4d67488da33c17bc2150616c/pydantic_ai-1.70.0-py3-none-any.whl", hash = "sha256:d2dbac707153fcdd890e48fc31c4235b4f5f15c815fb60438b76085ffcd0205f", size = 7227, upload-time = "2026-03-18T04:24:24.543Z" }, ] [[package]] name = "pydantic-ai-slim" -version = "1.46.0" +version = "1.70.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "genai-prices" }, - { name = "griffe" }, + { name = "griffelib" }, { name = "httpx" }, { name = "opentelemetry-api" }, { name = "pydantic" }, { name = "pydantic-graph" }, { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c6/f3/c053fef7e4d55b7b28fea5d3a738e5e6fa15f227668faed53c76226ae79a/pydantic_ai_slim-1.46.0.tar.gz", hash = "sha256:8925bc2c54b6c1f5168142d703ecfdba65162d08dae9908bf583932fdf631d09", size = 393260, upload-time = "2026-01-23T00:07:18.831Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ac/97/d57ee44976c349658ea7c645c5c2e1a26830e4b60fdeeee2669d4aaef6eb/pydantic_ai_slim-1.70.0.tar.gz", hash = "sha256:3df0c0e92f72c35e546d24795bce1f4d38f81da2d10addd2e9f255b2d2c83c91", size = 445474, upload-time = "2026-03-18T04:24:34.393Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/d8/640ccbd4d63021a7bd724571dfe92c5868e3890a1172e159b828c84c30dc/pydantic_ai_slim-1.46.0-py3-none-any.whl", hash = "sha256:2494ca9be6009a5e27db09fecb1ab49f0b569a6e7fcd2eda067262bcbd497856", size = 515335, upload-time = "2026-01-23T00:07:10.751Z" }, + { url = "https://files.pythonhosted.org/packages/da/8c/8545d28d0b3a9957aa21393cfdab8280bb854362360b296cd486ed1713ec/pydantic_ai_slim-1.70.0-py3-none-any.whl", hash = "sha256:162907092a562b3160d9ef0418d317ec941c5c0e6dd6e0aa0dbb53b5a5cd3450", size = 576244, upload-time = "2026-03-18T04:24:27.301Z" }, ] [package.optional-dependencies] @@ -2795,7 +2997,7 @@ groq = [ { name = "groq" }, ] huggingface = [ - { name = "huggingface-hub", extra = ["inference"] }, + { name = "huggingface-hub" }, ] logfire = [ { name = "logfire", extra = ["httpx"] }, @@ -2900,7 +3102,7 @@ wheels = [ [[package]] name = "pydantic-evals" -version = "1.46.0" +version = "1.70.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2910,14 +3112,14 @@ dependencies = [ { name = "pyyaml" }, { name = "rich" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/da/ce/044bde6ba4f0da335d7f7955c58b86e45ba275b009b46cd61d5b53b62f06/pydantic_evals-1.46.0.tar.gz", hash = "sha256:66c52ad006d6fa7d05f563d667d20377a46edb54ef638c2b83c7660215560f76", size = 47173, upload-time = "2026-01-23T00:07:20.254Z" } +sdist = { url = "https://files.pythonhosted.org/packages/01/46/21ab46e81cba78892c92ab71d21b61b23682e5e5fc645aa3647822abc3a5/pydantic_evals-1.70.0.tar.gz", hash = "sha256:ac42099233557344b41f6c43429294e61202490eb0ee9ebf6422dd4c7ea6d941", size = 56737, upload-time = "2026-03-18T04:24:35.643Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/81/02/23cbcb3843b51bad4ecda57e2047fbbf82743e4bd29e694a17d366648470/pydantic_evals-1.46.0-py3-none-any.whl", hash = "sha256:6a7cdfd3bf5e5d99c76fb77e3d41897b9ef90c4ee300f937509cdbeaec8e16f9", size = 56346, upload-time = 
"2026-01-23T00:07:12.216Z" }, + { url = "https://files.pythonhosted.org/packages/13/9a/6d5b74b602820621bb225e47d47f514d72e5ac5119e5dd740cd493e8ffa7/pydantic_evals-1.70.0-py3-none-any.whl", hash = "sha256:2f0c3c045c8c07b3d13876b8b0a64063ef14eb9ce27331694c8c1275f9c234b1", size = 67604, upload-time = "2026-03-18T04:24:29.134Z" }, ] [[package]] name = "pydantic-graph" -version = "1.46.0" +version = "1.70.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "httpx" }, @@ -2925,9 +3127,9 @@ dependencies = [ { name = "pydantic" }, { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b9/43/09cc322c1e7cf69e8f01fc6f09f7cd952b1fb49818cf2bee556f3b5fba07/pydantic_graph-1.46.0.tar.gz", hash = "sha256:ef0d316c95bdc37af20bdf3c343fb1caee2c8b536245d712c3ed46af0734319e", size = 58455, upload-time = "2026-01-23T00:07:21.125Z" } +sdist = { url = "https://files.pythonhosted.org/packages/07/27/f7a71ca2a3705e7c24fd777959cf5515646cc5f23b5b16c886a2ed373340/pydantic_graph-1.70.0.tar.gz", hash = "sha256:3f76d9137369ef8748b0e8a6df1a08262118af20a32bc139d23e5c0509c6b711", size = 58578, upload-time = "2026-03-18T04:24:37.007Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e0/e9/058fd0001c2aed3675bc80d404c6171a753a4ff08bb570ec252848d6146d/pydantic_graph-1.46.0-py3-none-any.whl", hash = "sha256:cdbc609df49e2eeb9d0d4e43f87288b79ed9d021157ba639e71d862da4b71443", size = 72325, upload-time = "2026-01-23T00:07:13.807Z" }, + { url = "https://files.pythonhosted.org/packages/38/fd/19c42b60c37dfdbbf5b76c7b218e8309b43dac501f7aaf2025527ca05023/pydantic_graph-1.70.0-py3-none-any.whl", hash = "sha256:6083c1503a2587990ee1b8a15915106e3ddabc8f3f11fbc4a108a7d7496af4a5", size = 72351, upload-time = "2026-03-18T04:24:30.291Z" }, ] [[package]] @@ -3049,6 +3251,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a6/92/d40f5d937517cc489ad848fc4414ecccc7592e4686b9071e09e64f5e378e/pylint-4.0.4-py3-none-any.whl", hash = "sha256:63e06a37d5922555ee2c20963eb42559918c20bd2b21244e4ef426e7c43b92e0", size = 536425, upload-time = "2025-11-30T13:29:02.53Z" }, ] +[[package]] +name = "pymdown-extensions" +version = "10.21" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ba/63/06673d1eb6d8f83c0ea1f677d770e12565fb516928b4109c9e2055656a9e/pymdown_extensions-10.21.tar.gz", hash = "sha256:39f4a020f40773f6b2ff31d2cd2546c2c04d0a6498c31d9c688d2be07e1767d5", size = 853363, upload-time = "2026-02-15T20:44:06.748Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6f/2c/5b079febdc65e1c3fb2729bf958d18b45be7113828528e8a0b5850dd819a/pymdown_extensions-10.21-py3-none-any.whl", hash = "sha256:91b879f9f864d49794c2d9534372b10150e6141096c3908a455e45ca72ad9d3f", size = 268877, upload-time = "2026-02-15T20:44:05.464Z" }, +] + [[package]] name = "pymgclient" version = "1.5.1" @@ -3072,15 +3287,15 @@ wheels = [ [[package]] name = "pyopenssl" -version = "25.3.0" +version = "26.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cryptography" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/80/be/97b83a464498a79103036bc74d1038df4a7ef0e402cfaf4d5e113fb14759/pyopenssl-25.3.0.tar.gz", hash = "sha256:c981cb0a3fd84e8602d7afc209522773b94c1c2446a3c710a75b06fe1beae329", size = 184073, upload-time = "2025-09-17T00:32:21.037Z" } +sdist = { 
url = "https://files.pythonhosted.org/packages/8e/11/a62e1d33b373da2b2c2cd9eb508147871c80f12b1cacde3c5d314922afdd/pyopenssl-26.0.0.tar.gz", hash = "sha256:f293934e52936f2e3413b89c6ce36df66a0b34ae1ea3a053b8c5020ff2f513fc", size = 185534, upload-time = "2026-03-15T14:28:26.353Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/81/ef2b1dfd1862567d573a4fdbc9f969067621764fbb74338496840a1d2977/pyopenssl-25.3.0-py3-none-any.whl", hash = "sha256:1fda6fc034d5e3d179d39e59c1895c9faeaf40a79de5fc4cbbfbe0d36f4a77b6", size = 57268, upload-time = "2025-09-17T00:32:19.474Z" }, + { url = "https://files.pythonhosted.org/packages/fb/7d/d4f7d908fa8415571771b30669251d57c3cf313b36a856e6d7548ae01619/pyopenssl-26.0.0-py3-none-any.whl", hash = "sha256:df94d28498848b98cc1c0ffb8ef1e71e40210d3b0a8064c9d29571ed2904bf81", size = 57969, upload-time = "2026-03-15T14:28:24.864Z" }, ] [[package]] @@ -3258,6 +3473,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, ] +[[package]] +name = "pyyaml-env-tag" +version = "1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/2e/79c822141bfd05a853236b504869ebc6b70159afc570e1d5a20641782eaa/pyyaml_env_tag-1.1.tar.gz", hash = "sha256:2eb38b75a2d21ee0475d6d97ec19c63287a7e140231e4214969d0eac923cd7ff", size = 5737, upload-time = "2025-05-13T15:24:01.64Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/11/432f32f8097b03e3cd5fe57e88efb685d964e2e5178a48ed61e841f7fdce/pyyaml_env_tag-1.1-py3-none-any.whl", hash = "sha256:17109e1a528561e32f026364712fee1264bc2ea6715120891174ed1b980d2e04", size = 4722, upload-time = "2025-05-13T15:23:59.629Z" }, +] + [[package]] name = "qdrant-client" version = "1.16.2" @@ -3402,7 +3629,7 @@ wheels = [ [[package]] name = "requests" -version = "2.32.5" +version = "2.33.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "certifi" }, @@ -3410,9 +3637,9 @@ dependencies = [ { name = "idna" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } +sdist = { url = "https://files.pythonhosted.org/packages/34/64/8860370b167a9721e8956ae116825caff829224fbca0ca6e7bf8ddef8430/requests-2.33.0.tar.gz", hash = "sha256:c7ebc5e8b0f21837386ad0e1c8fe8b829fa5f544d8df3b2253bff14ef29d7652", size = 134232, upload-time = "2026-03-25T15:10:41.586Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, + { url = "https://files.pythonhosted.org/packages/56/5d/c814546c2333ceea4ba42262d8c4d55763003e767fa169adc693bd524478/requests-2.33.0-py3-none-any.whl", hash = "sha256:3324635456fa185245e24865e810cecec7b4caf933d7eb133dcde67d48cee69b", size = 65017, upload-time = "2026-03-25T15:10:40.382Z" }, ] [[package]] @@ -3956,8 +4183,13 @@ dependencies = [ { name = 
"typing-extensions" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/c9/2f/0b295dd8d199ef71e6f176f576473d645d41357b7b8aa978cc6b042575df/torch-2.10.0-1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:6abb224c2b6e9e27b592a1c0015c33a504b00a0e0938f1499f7f514e9b7bfb5c", size = 79498197, upload-time = "2026-02-06T17:37:27.627Z" }, - { url = "https://files.pythonhosted.org/packages/a4/1b/af5fccb50c341bd69dc016769503cb0857c1423fbe9343410dfeb65240f2/torch-2.10.0-1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7350f6652dfd761f11f9ecb590bfe95b573e2961f7a242eccb3c8e78348d26fe", size = 79498248, upload-time = "2026-02-06T17:37:31.982Z" }, + { url = "https://files.pythonhosted.org/packages/d3/54/a2ba279afcca44bbd320d4e73675b282fcee3d81400ea1b53934efca6462/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:13ec4add8c3faaed8d13e0574f5cd4a323c11655546f91fbe6afa77b57423574", size = 79498202, upload-time = "2026-02-10T21:44:52.603Z" }, + { url = "https://files.pythonhosted.org/packages/ec/23/2c9fe0c9c27f7f6cb865abcea8a4568f29f00acaeadfc6a37f6801f84cb4/torch-2.10.0-2-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:e521c9f030a3774ed770a9c011751fb47c4d12029a3d6522116e48431f2ff89e", size = 79498254, upload-time = "2026-02-10T21:44:44.095Z" }, + { url = "https://files.pythonhosted.org/packages/b3/7a/abada41517ce0011775f0f4eacc79659bc9bc6c361e6bfe6f7052a6b9363/torch-2.10.0-3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:98c01b8bb5e3240426dcde1446eed6f40c778091c8544767ef1168fc663a05a6", size = 915622781, upload-time = "2026-03-11T14:17:11.354Z" }, + { url = "https://files.pythonhosted.org/packages/ab/c6/4dfe238342ffdcec5aef1c96c457548762d33c40b45a1ab7033bb26d2ff2/torch-2.10.0-3-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:80b1b5bfe38eb0e9f5ff09f206dcac0a87aadd084230d4a36eea5ec5232c115b", size = 915627275, upload-time = "2026-03-11T14:16:11.325Z" }, + { url = "https://files.pythonhosted.org/packages/d8/f0/72bf18847f58f877a6a8acf60614b14935e2f156d942483af1ffc081aea0/torch-2.10.0-3-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:46b3574d93a2a8134b3f5475cfb98e2eb46771794c57015f6ad1fb795ec25e49", size = 915523474, upload-time = "2026-03-11T14:17:44.422Z" }, + { url = "https://files.pythonhosted.org/packages/f4/39/590742415c3030551944edc2ddc273ea1fdfe8ffb2780992e824f1ebee98/torch-2.10.0-3-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:b1d5e2aba4eb7f8e87fbe04f86442887f9167a35f092afe4c237dfcaaef6e328", size = 915632474, upload-time = "2026-03-11T14:15:13.666Z" }, + { url = "https://files.pythonhosted.org/packages/b6/8e/34949484f764dde5b222b7fe3fede43e4a6f0da9d7f8c370bb617d629ee2/torch-2.10.0-3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:0228d20b06701c05a8f978357f657817a4a63984b0c90745def81c18aedfa591", size = 915523882, upload-time = "2026-03-11T14:14:46.311Z" }, { url = "https://files.pythonhosted.org/packages/cc/af/758e242e9102e9988969b5e621d41f36b8f258bb4a099109b7a4b4b50ea4/torch-2.10.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5fd4117d89ffd47e3dcc71e71a22efac24828ad781c7e46aaaf56bf7f2796acf", size = 145996088, upload-time = "2026-01-21T16:24:44.171Z" }, { url = "https://files.pythonhosted.org/packages/23/8e/3c74db5e53bff7ed9e34c8123e6a8bfef718b2450c35eefab85bb4a7e270/torch-2.10.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:787124e7db3b379d4f1ed54dd12ae7c741c16a4d29b49c0226a89bea50923ffb", size = 915711952, upload-time = "2026-01-21T16:23:53.503Z" }, { url = 
"https://files.pythonhosted.org/packages/6e/01/624c4324ca01f66ae4c7cd1b74eb16fb52596dce66dbe51eff95ef9e7a4c/torch-2.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:2c66c61f44c5f903046cc696d088e21062644cbe541c7f1c4eaae88b2ad23547", size = 113757972, upload-time = "2026-01-21T16:24:39.516Z" }, @@ -3994,45 +4226,66 @@ wheels = [ [[package]] name = "transformers" -version = "4.57.6" +version = "5.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock" }, { name = "huggingface-hub" }, { name = "numpy" }, { name = "packaging" }, { name = "pyyaml" }, { name = "regex" }, - { name = "requests" }, { name = "safetensors" }, { name = "tokenizers" }, { name = "tqdm" }, + { name = "typer" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c4/35/67252acc1b929dc88b6602e8c4a982e64f31e733b804c14bc24b47da35e6/transformers-4.57.6.tar.gz", hash = "sha256:55e44126ece9dc0a291521b7e5492b572e6ef2766338a610b9ab5afbb70689d3", size = 10134912, upload-time = "2026-01-16T10:38:39.284Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/1a/70e830d53ecc96ce69cfa8de38f163712d2b43ac52fbd743f39f56025c31/transformers-5.3.0.tar.gz", hash = "sha256:009555b364029da9e2946d41f1c5de9f15e6b1df46b189b7293f33a161b9c557", size = 8830831, upload-time = "2026-03-04T17:41:46.119Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/03/b8/e484ef633af3887baeeb4b6ad12743363af7cce68ae51e938e00aaa0529d/transformers-4.57.6-py3-none-any.whl", hash = "sha256:4c9e9de11333ddfe5114bc872c9f370509198acf0b87a832a0ab9458e2bd0550", size = 11993498, upload-time = "2026-01-16T10:38:31.289Z" }, + { url = "https://files.pythonhosted.org/packages/b8/88/ae8320064e32679a5429a2c9ebbc05c2bf32cefb6e076f9b07f6d685a9b4/transformers-5.3.0-py3-none-any.whl", hash = "sha256:50ac8c89c3c7033444fb3f9f53138096b997ebb70d4b5e50a2e810bf12d3d29a", size = 10661827, upload-time = "2026-03-04T17:41:42.722Z" }, ] [[package]] name = "tree-sitter" -version = "0.25.0" +version = "0.25.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/66/7c/0350cfc47faadc0d3cf7d8237a4e34032b3014ddf4a12ded9933e1648b55/tree-sitter-0.25.2.tar.gz", hash = "sha256:fe43c158555da46723b28b52e058ad444195afd1db3ca7720c59a254544e9c20", size = 177961, upload-time = "2025-09-25T17:37:59.751Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/9e/20c2a00a862f1c2897a436b17edb774e831b22218083b459d0d081c9db33/tree_sitter-0.25.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ddabfff809ffc983fc9963455ba1cecc90295803e06e140a4c83e94c1fa3d960", size = 146941, upload-time = "2025-09-25T17:37:34.813Z" }, + { url = "https://files.pythonhosted.org/packages/ef/04/8512e2062e652a1016e840ce36ba1cc33258b0dcc4e500d8089b4054afec/tree_sitter-0.25.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c0c0ab5f94938a23fe81928a21cc0fac44143133ccc4eb7eeb1b92f84748331c", size = 137699, upload-time = "2025-09-25T17:37:36.349Z" }, + { url = "https://files.pythonhosted.org/packages/47/8a/d48c0414db19307b0fb3bb10d76a3a0cbe275bb293f145ee7fba2abd668e/tree_sitter-0.25.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd12d80d91d4114ca097626eb82714618dcdfacd6a5e0955216c6485c350ef99", size = 607125, upload-time = "2025-09-25T17:37:37.725Z" }, + { url = 
"https://files.pythonhosted.org/packages/39/d1/b95f545e9fc5001b8a78636ef942a4e4e536580caa6a99e73dd0a02e87aa/tree_sitter-0.25.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b43a9e4c89d4d0839de27cd4d6902d33396de700e9ff4c5ab7631f277a85ead9", size = 635418, upload-time = "2025-09-25T17:37:38.922Z" }, + { url = "https://files.pythonhosted.org/packages/de/4d/b734bde3fb6f3513a010fa91f1f2875442cdc0382d6a949005cd84563d8f/tree_sitter-0.25.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbb1706407c0e451c4f8cc016fec27d72d4b211fdd3173320b1ada7a6c74c3ac", size = 631250, upload-time = "2025-09-25T17:37:40.039Z" }, + { url = "https://files.pythonhosted.org/packages/46/f2/5f654994f36d10c64d50a192239599fcae46677491c8dd53e7579c35a3e3/tree_sitter-0.25.2-cp312-cp312-win_amd64.whl", hash = "sha256:6d0302550bbe4620a5dc7649517c4409d74ef18558276ce758419cf09e578897", size = 127156, upload-time = "2025-09-25T17:37:41.132Z" }, + { url = "https://files.pythonhosted.org/packages/67/23/148c468d410efcf0a9535272d81c258d840c27b34781d625f1f627e2e27d/tree_sitter-0.25.2-cp312-cp312-win_arm64.whl", hash = "sha256:0c8b6682cac77e37cfe5cf7ec388844957f48b7bd8d6321d0ca2d852994e10d5", size = 113984, upload-time = "2025-09-25T17:37:42.074Z" }, + { url = "https://files.pythonhosted.org/packages/8c/67/67492014ce32729b63d7ef318a19f9cfedd855d677de5773476caf771e96/tree_sitter-0.25.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0628671f0de69bb279558ef6b640bcfc97864fe0026d840f872728a86cd6b6cd", size = 146926, upload-time = "2025-09-25T17:37:43.041Z" }, + { url = "https://files.pythonhosted.org/packages/4e/9c/a278b15e6b263e86c5e301c82a60923fa7c59d44f78d7a110a89a413e640/tree_sitter-0.25.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f5ddcd3e291a749b62521f71fc953f66f5fd9743973fd6dd962b092773569601", size = 137712, upload-time = "2025-09-25T17:37:44.039Z" }, + { url = "https://files.pythonhosted.org/packages/54/9a/423bba15d2bf6473ba67846ba5244b988cd97a4b1ea2b146822162256794/tree_sitter-0.25.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bd88fbb0f6c3a0f28f0a68d72df88e9755cf5215bae146f5a1bdc8362b772053", size = 607873, upload-time = "2025-09-25T17:37:45.477Z" }, + { url = "https://files.pythonhosted.org/packages/ed/4c/b430d2cb43f8badfb3a3fa9d6cd7c8247698187b5674008c9d67b2a90c8e/tree_sitter-0.25.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b878e296e63661c8e124177cc3084b041ba3f5936b43076d57c487822426f614", size = 636313, upload-time = "2025-09-25T17:37:46.68Z" }, + { url = "https://files.pythonhosted.org/packages/9d/27/5f97098dbba807331d666a0997662e82d066e84b17d92efab575d283822f/tree_sitter-0.25.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d77605e0d353ba3fe5627e5490f0fbfe44141bafa4478d88ef7954a61a848dae", size = 631370, upload-time = "2025-09-25T17:37:47.993Z" }, + { url = "https://files.pythonhosted.org/packages/d4/3c/87caaed663fabc35e18dc704cd0e9800a0ee2f22bd18b9cbe7c10799895d/tree_sitter-0.25.2-cp313-cp313-win_amd64.whl", hash = "sha256:463c032bd02052d934daa5f45d183e0521ceb783c2548501cf034b0beba92c9b", size = 127157, upload-time = "2025-09-25T17:37:48.967Z" }, + { url = "https://files.pythonhosted.org/packages/d5/23/f8467b408b7988aff4ea40946a4bd1a2c1a73d17156a9d039bbaff1e2ceb/tree_sitter-0.25.2-cp313-cp313-win_arm64.whl", hash = "sha256:b3f63a1796886249bd22c559a5944d64d05d43f2be72961624278eff0dcc5cb8", size = 113975, upload-time = 
"2025-09-25T17:37:49.922Z" }, + { url = "https://files.pythonhosted.org/packages/07/e3/d9526ba71dfbbe4eba5e51d89432b4b333a49a1e70712aa5590cd22fc74f/tree_sitter-0.25.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:65d3c931013ea798b502782acab986bbf47ba2c452610ab0776cf4a8ef150fc0", size = 146776, upload-time = "2025-09-25T17:37:50.898Z" }, + { url = "https://files.pythonhosted.org/packages/42/97/4bd4ad97f85a23011dd8a535534bb1035c4e0bac1234d58f438e15cff51f/tree_sitter-0.25.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bda059af9d621918efb813b22fb06b3fe00c3e94079c6143fcb2c565eb44cb87", size = 137732, upload-time = "2025-09-25T17:37:51.877Z" }, + { url = "https://files.pythonhosted.org/packages/b6/19/1e968aa0b1b567988ed522f836498a6a9529a74aab15f09dd9ac1e41f505/tree_sitter-0.25.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eac4e8e4c7060c75f395feec46421eb61212cb73998dbe004b7384724f3682ab", size = 609456, upload-time = "2025-09-25T17:37:52.925Z" }, + { url = "https://files.pythonhosted.org/packages/48/b6/cf08f4f20f4c9094006ef8828555484e842fc468827ad6e56011ab668dbd/tree_sitter-0.25.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:260586381b23be33b6191a07cea3d44ecbd6c01aa4c6b027a0439145fcbc3358", size = 636772, upload-time = "2025-09-25T17:37:54.647Z" }, + { url = "https://files.pythonhosted.org/packages/57/e2/d42d55bf56360987c32bc7b16adb06744e425670b823fb8a5786a1cea991/tree_sitter-0.25.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7d2ee1acbacebe50ba0f85fff1bc05e65d877958f00880f49f9b2af38dce1af0", size = 631522, upload-time = "2025-09-25T17:37:55.833Z" }, + { url = "https://files.pythonhosted.org/packages/03/87/af9604ebe275a9345d88c3ace0cf2a1341aa3f8ef49dd9fc11662132df8a/tree_sitter-0.25.2-cp314-cp314-win_amd64.whl", hash = "sha256:4973b718fcadfb04e59e746abfbb0288694159c6aeecd2add59320c03368c721", size = 130864, upload-time = "2025-09-25T17:37:57.453Z" }, + { url = "https://files.pythonhosted.org/packages/a6/6e/e64621037357acb83d912276ffd30a859ef117f9c680f2e3cb955f47c680/tree_sitter-0.25.2-cp314-cp314-win_arm64.whl", hash = "sha256:b8d4429954a3beb3e844e2872610d2a4800ba4eb42bb1990c6a4b1949b18459f", size = 117470, upload-time = "2025-09-25T17:37:58.431Z" }, +] + +[[package]] +name = "tree-sitter-c" +version = "0.24.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/98/21/e952c3180f0fd83d09cee9e0bc29f67827c659cee45077ae06eb7d813cfc/tree-sitter-0.25.0.tar.gz", hash = "sha256:15c88775cf24db06677bafe62df058a6457d8a6dde67baa48dd3723b905e79a6", size = 177740, upload-time = "2025-07-20T13:17:48.886Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/f5/ba8cd08d717277551ade8537d3aa2a94b907c6c6e0fbcf4e4d8b1c747fa3/tree_sitter_c-0.24.1.tar.gz", hash = "sha256:7d2d0cda0b8dda428c81440c1e94367f9f13548eedca3f49768bde66b1422ad6", size = 228014, upload-time = "2025-05-24T17:32:58.384Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/75/36a4726a09aeb0477ca4a45aba4abf9705642b871539005ca91ddd68faa3/tree_sitter-0.25.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d9efacce0140ad74f97e027fb4ae693debff05f6246f3e024937f9500a0e874a", size = 147016, upload-time = "2025-07-20T13:17:33.921Z" }, - { url = "https://files.pythonhosted.org/packages/ff/5e/a549a21e459de94056cf48ca5e10e3774bc9b0460ffb3aec469a5f6001c0/tree_sitter-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:82b4a5535107d2b8feee085edcafa89858faa4e1a98e94cfe1740c0ca8c28d84", size = 140832, upload-time = "2025-07-20T13:17:34.82Z" }, - { url = "https://files.pythonhosted.org/packages/d7/ed/7cc29a309e5f5cc209902c93589d29a4faeb656c7eecc1abd86842633b8f/tree_sitter-0.25.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c613372545490dfba3b3e7d934fda1156e3d16b27c0335c65a92f2b4fa6af5da", size = 617875, upload-time = "2025-07-20T13:17:35.693Z" }, - { url = "https://files.pythonhosted.org/packages/76/fc/43a61a35f021429d905ce272be9a9ea6dad6fe2c849782c53bd083a935cf/tree_sitter-0.25.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:241a90c815a354594d3147012ce470cfc797695ab768e29198815e147ef3c165", size = 635857, upload-time = "2025-07-20T13:17:36.676Z" }, - { url = "https://files.pythonhosted.org/packages/9b/28/c9236c505e35b3aedb3c941a359a708c173cbedab8d843fec729bab81ed9/tree_sitter-0.25.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f0b01b5068f1888af223021ba461480df28c76f39893c8113aae2154a2b81fd", size = 632649, upload-time = "2025-07-20T13:17:37.56Z" }, - { url = "https://files.pythonhosted.org/packages/13/d3/5dff82a02646619545c4e7c9b9ec87bc126f1937760228fcf2e91f5079c7/tree_sitter-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:1807bd1dae1f50721d65b270e6ffa85de84234ae39f98f4da702db56c2627e23", size = 126785, upload-time = "2025-07-20T13:17:38.488Z" }, - { url = "https://files.pythonhosted.org/packages/71/61/4fffd405569d9c1551906766825da75a2d8f1c075be8994542d5d7ba7768/tree_sitter-0.25.0-cp312-cp312-win_arm64.whl", hash = "sha256:7848be6aeab5c1d62d649506d80d0e463727cb1bb55f423e88bf317db0be8d67", size = 113615, upload-time = "2025-07-20T13:17:39.965Z" }, - { url = "https://files.pythonhosted.org/packages/7a/fd/7578088dddec9b89b60d8dfea1901f3a5dff61b66d3c637c309b6209c8db/tree_sitter-0.25.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:689a19d51103f727a545ec9ba9cd377267445859838c38ec55d159dc57e82e8a", size = 147009, upload-time = "2025-07-20T13:17:41.038Z" }, - { url = "https://files.pythonhosted.org/packages/7a/3e/6e3dac18c119acf738174a19ce91d89b34f6ad1ca1c5dd57b245ae15c935/tree_sitter-0.25.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:86288b218ef958dcafe40030d6d70c99baffaf808bd81b49de160f9724fc0ba4", size = 140828, upload-time = "2025-07-20T13:17:42.023Z" }, - { url = "https://files.pythonhosted.org/packages/fa/21/94d26f5d488d85bf5201280f82ce7de374ce30ed5d5469e57623d64ead9a/tree_sitter-0.25.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5241610319177ee2f68b8e719bf1e1b309155e126d9cd567ff84f20878d7e5d0", size = 618600, upload-time = "2025-07-20T13:17:43.203Z" }, - { url = "https://files.pythonhosted.org/packages/67/74/e852445871c0a82bfa5e3d16541e0ce6775ef458d3a8f03ab3737c661832/tree_sitter-0.25.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ae1553d652a54926f80dc0a42fba07db110bb1a3ebaf47d1c4c64f8d44dd8207", size = 636691, upload-time = "2025-07-20T13:17:44.382Z" }, - { url = "https://files.pythonhosted.org/packages/87/67/759afe10e0018aa3ca3269df0257228b2df120e3956171a3667b133f3100/tree_sitter-0.25.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ccac581551407a73a519b872553973598b69d3d237ffaf32408fb38ecb775484", size = 632730, upload-time = "2025-07-20T13:17:45.687Z" }, - { url = 
"https://files.pythonhosted.org/packages/8d/42/24a80dafdb32f1f7d16e3236f2ba8a2bc7b0e5c2a19c7b45f874f0980e90/tree_sitter-0.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:d58e912869514ebb441b15c22a13a9c78f1b69be15f6a42b1d18e3f790e5d6ba", size = 126779, upload-time = "2025-07-20T13:17:46.943Z" }, - { url = "https://files.pythonhosted.org/packages/6f/2e/6af369e9d6deab9baaa60e2fa91acf82a68c63d835a2fe4f4265674ecc53/tree_sitter-0.25.0-cp313-cp313-win_arm64.whl", hash = "sha256:a1b8302161fa8da52cfafcd7575fa7d5806a9608a0b51c7a1fe45bfe70b62d46", size = 113623, upload-time = "2025-07-20T13:17:47.718Z" }, + { url = "https://files.pythonhosted.org/packages/15/c7/c817be36306e457c2d36cc324789046390d9d8c555c38772429ffdb7d361/tree_sitter_c-0.24.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9c06ac26a1efdcc8b26a8a6970fbc6997c4071857359e5837d4c42892d45fe1e", size = 80940, upload-time = "2025-05-24T17:32:49.967Z" }, + { url = "https://files.pythonhosted.org/packages/7a/42/283909467290b24fdbc29bb32ee20e409a19a55002b43175d66d091ca1a4/tree_sitter_c-0.24.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:942bcd7cbecd810dcf7ca6f8f834391ebf0771a89479646d891ba4ca2fdfdc88", size = 86304, upload-time = "2025-05-24T17:32:51.271Z" }, + { url = "https://files.pythonhosted.org/packages/94/53/fb4f61d4e5f15ec3da85774a4df8e58d3b5b73036cf167f0203b4dd9d158/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a74cfd7a11ca5a961fafd4d751892ee65acae667d2818968a6f079397d8d28c", size = 109996, upload-time = "2025-05-24T17:32:52.119Z" }, + { url = "https://files.pythonhosted.org/packages/5e/e8/fc541d34ee81c386c5453c2596c1763e8e9cd7cb0725f39d7dfa2276afa4/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6a807705a3978911dc7ee26a7ad36dcfacb6adfc13c190d496660ec9bd66707", size = 98137, upload-time = "2025-05-24T17:32:53.361Z" }, + { url = "https://files.pythonhosted.org/packages/32/c6/d0563319cae0d5b5780a92e2806074b24afea2a07aa4c10599b899bda3ec/tree_sitter_c-0.24.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:789781afcb710df34144f7e2a20cd80e325114b9119e3956c6bd1dd2d365df98", size = 94148, upload-time = "2025-05-24T17:32:54.855Z" }, + { url = "https://files.pythonhosted.org/packages/50/5a/6361df7f3fa2310c53a0d26b4702a261c332da16fa9d801e381e3a86e25f/tree_sitter_c-0.24.1-cp310-abi3-win_amd64.whl", hash = "sha256:290bff0f9c79c966496ebae45042f77543e6e4aea725f40587a8611d566231a8", size = 84703, upload-time = "2025-05-24T17:32:56.084Z" }, + { url = "https://files.pythonhosted.org/packages/22/6a/210a302e8025ac492cbaea58d3720d66b7d8034c5d747ac5e4d2d235aa25/tree_sitter_c-0.24.1-cp310-abi3-win_arm64.whl", hash = "sha256:d46bbda06f838c2dcb91daf767813671fd366b49ad84ff37db702129267b46e1", size = 82715, upload-time = "2025-05-24T17:32:57.248Z" }, ] [[package]] @@ -4113,6 +4366,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/73/ac/2615b858c9fc6c2f5458c6375c501392ef45c486e576985393521ca50971/tree_sitter_lua-0.4.1-cp310-abi3-win_arm64.whl", hash = "sha256:081577e4ca58f3b4f1856794f3e2f5a0955476b68a2a50baf85c9bb05b932738", size = 22752, upload-time = "2025-12-31T12:50:38.117Z" }, ] +[[package]] +name = "tree-sitter-php" +version = "0.24.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/c8/1a499038cb4036bea1d560ffbc807a6fb940261aa22296bd49a62ed8bcba/tree_sitter_php-0.24.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = 
"sha256:d56e2dcf025450f84a2cdbf4b18a09e6cb88b92e9e6858e63de3d4133ab2e43e", size = 219550, upload-time = "2025-08-16T22:14:30.212Z" }, + { url = "https://files.pythonhosted.org/packages/ab/5e/b52f2599acb29f6899470f7137d3d491c752b88df3950fb7408aea57ddca/tree_sitter_php-0.24.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:29759c67d4c27a68c227ed82c0b7e4699617b1bd23757d50c081f81a12b4f80d", size = 229632, upload-time = "2025-08-16T22:14:31.85Z" }, + { url = "https://files.pythonhosted.org/packages/6b/58/ca290da45380bd6ba7c6b0b98cc5fc30325c32c7f14f0c93196a451b19c4/tree_sitter_php-0.24.1-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:94b89832ac09f078eed2acd88598838bc51012224cbcebb916dbb6a37e74357e", size = 325351, upload-time = "2025-08-16T22:14:33Z" }, + { url = "https://files.pythonhosted.org/packages/9a/c6/fd863a7a779d0ab67688939eba0e08bff7b1ffe731288d3d3610df21217b/tree_sitter_php-0.24.1-cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7a1404a30f2972498ace040b0029738b8dac45d0a12932ccb8b605eb94bafbe4", size = 313021, upload-time = "2025-08-16T22:14:34.394Z" }, + { url = "https://files.pythonhosted.org/packages/48/ed/aace12f30c4f5474a9ad0e9da85c060174e3764342c9860974bb0feb02fc/tree_sitter_php-0.24.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:3e96f61462a960c78e5389c7ba6c16c25e66b465c763b8e63ad66423326c2fa7", size = 305905, upload-time = "2025-08-16T22:14:35.846Z" }, + { url = "https://files.pythonhosted.org/packages/4e/c4/6c690c33b1ae9cae9505c0a2896f046fda174d72c46bdafce6aab3b2f2e7/tree_sitter_php-0.24.1-cp310-abi3-win_amd64.whl", hash = "sha256:1a1b65b72a8410d421f914ee13d38fd546a94d01cb834f69b27c78ba7589a5b5", size = 208014, upload-time = "2025-08-16T22:14:37.206Z" }, + { url = "https://files.pythonhosted.org/packages/7b/69/54c670d725c092b89e76ca6984582b6a768b128ac1859ed48141b124da1d/tree_sitter_php-0.24.1-cp310-abi3-win_arm64.whl", hash = "sha256:56a70c5ef1bddb15f220a479b2f2edf3042c764b6c443921fbd7ca9174d664e3", size = 206033, upload-time = "2025-08-16T22:14:38.632Z" }, +] + [[package]] name = "tree-sitter-python" version = "0.25.0"