From 82e6b5cea6c6559a8a7497fd26696ca1dcd1c052 Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Fri, 29 May 2026 14:43:07 +0200 Subject: [PATCH 1/6] feat: implement compare mode for release notes generation - Introduced a new compare mode in the release notes generator to handle branching release histories. - The compare mode activates when `from-tag-name` is provided, allowing for accurate commit selection based on the commit graph rather than timestamps. - Added functionality to extract PR numbers from commit messages in both squash and merge formats. - Updated `DataMiner` to utilize the GitHub Compare API for fetching commits unique to the current release. - Enhanced filtering logic to retain PRs and commits in compare mode, regardless of their merge dates. - Added comprehensive unit and integration tests to validate the new compare mode functionality. - Documented the purpose, usage, and configuration of the compare mode in the relevant documentation files. --- README.md | 1 + docs/features/compare_mode.md | 152 +++++++++ docs/features/tag_range.md | 1 + release_notes_generator/data/SPEC.md | 257 +++++++++++++++ release_notes_generator/data/filter.py | 53 ++-- release_notes_generator/data/miner.py | 59 +++- release_notes_generator/model/mined_data.py | 1 + tests/integration/test_compare_mode.py | 292 ++++++++++++++++++ .../data/test_filter.py | 109 ++++++- .../data/test_miner.py | 217 +++++++++++++ 10 files changed, 1111 insertions(+), 31 deletions(-) create mode 100644 docs/features/compare_mode.md create mode 100644 release_notes_generator/data/SPEC.md create mode 100644 tests/integration/test_compare_mode.py diff --git a/README.md b/README.md index 75c2db10..b54da0e9 100644 --- a/README.md +++ b/README.md @@ -183,6 +183,7 @@ Each feature is documented separately — click a name below to learn configurat | [Service Chapters](docs/features/service_chapters.md) | Quality & Warnings | Surfaces gaps: issues without PRs, unlabeled items, PRs without notes, etc. | | [Duplicity Handling](docs/features/duplicity_handling.md) | Quality & Warnings | Marks duplicate lines when the same issue appears in multiple chapters. | | [Tag Range Selection](docs/features/tag_range.md) | Time Range | Chooses scope via `tag-name`/`from-tag-name`. | +| [Compare Mode](docs/features/compare_mode.md) | Time Range | Graph-based commit selection via `repo.compare()` — correct for branching release histories (maintenance + develop in parallel). | | [Date Selection](docs/features/date_selection.md) | Time Range | Chooses scope via timestamps (`published-at` vs `created-at`). | | [Custom Row Formats](docs/features/custom_row_formats.md) | Formatting & Presentation | Controls row templates and placeholders (`{number}`, `{title}`, `{developers}`, …). | | [Custom Chapters](docs/features/custom_chapters.md) | Formatting & Presentation | Maps labels to chapter headings; aggregates multiple labels under one title. | diff --git a/docs/features/compare_mode.md b/docs/features/compare_mode.md new file mode 100644 index 00000000..9f724c9c --- /dev/null +++ b/docs/features/compare_mode.md @@ -0,0 +1,152 @@ +# Feature: Compare Mode + +## Purpose +Ensure release notes for a maintenance branch contain **only** the changes that belong +to that branch, even when a parallel development branch (`v2.7.x`) is active and produces +commits in the same timestamp window. + +## The Problem Compare Mode Solves + +The default (timestamp) approach asks GitHub: *"give me all commits/PRs since time T".* +That works perfectly when every release is on a single linear history. It breaks the +moment two release streams run in parallel. + +**Concrete example — two active streams:** + +``` +develop: +* (tag: v2.7.1) 2026-05-20 Improve Kafka consumer throughput (#1401) +* (tag: v2.7.0) 2026-05-14 Fix new service access role (#1363) +* 2026-05-07 Fix/1346 custom hive table (#1349) +| +| maintenance/v2.6.x: +| * (tag: v2.6.5) 2026-05-20 Backport: handle empty schema in Hive (#1402) +| * (tag: v2.6.4) 2026-05-14 Fix new service access role (#1363) ← cherry-pick +| * (tag: v2.6.3) 2026-05-07 Fix/1346 custom hive table (#1349) ← cherry-pick +|/ +* (tag: v2.6.0) 2026-04-21 Fixes for update-ca-certificates (#1318) +``` + +Generating release notes for **`v2.6.5`** (previous: `v2.6.4`): + +| Mode | What is fetched | Correct? | +|---|---|---| +| **Timestamp** | Everything between 2026-05-14 and 2026-05-20 on *any* branch → `#1363` (v2.7.0) + `#1401` (v2.7.1) + `#1402` | ❌ two develop PRs contaminate the patch notes | +| **Compare** | Only commits reachable from `v2.6.5` but **not** `v2.6.4` → `#1402` only | ✅ | + +--- + +## How Compare Mode Works + +### Activation + +Compare mode is active **when `from-tag-name` is explicitly provided**. When it is absent +the existing timestamp path runs unchanged. + +### Step 1 — Graph-based commit selection + +Instead of asking "what happened after time T?", the action asks GitHub: *"what commits +exist in `tag-name` that do not exist in `from-tag-name`?"* + +This is a pure graph operation — it follows the commit ancestry tree, not the clock. +The result is exactly the set of commits unique to the current release, regardless of +when they were authored or which branch they live on. + +### Step 2 — PRs derived from commit messages, not from a time filter + +Rather than fetching all closed PRs and filtering by timestamp, compare mode reads the +PR numbers directly from the commit messages returned in Step 1. Both common merge +styles are recognised: + +- **Squash-merge:** `Fix new service access role (#1363)` +- **Merge-commit:** `Merge pull request #1363 from org/branch` + +Each unique PR number is then fetched individually by number. This means only the PRs +that actually belong to the release are ever loaded. + +Cherry-picks are handled automatically: the commit message on the maintenance branch +preserves the original PR number, so the right PR is always found even though the +commit SHA differs from the one on develop. + +### Step 3 — Why a PR can have a date before `data.since` + +When a commit is cherry-picked, the PR object that gets fetched is the *original* PR — +the one that was merged onto develop weeks or months earlier. Its `merged_at` date is +that old develop date, which is before the previous maintenance tag's timestamp +(`data.since`). + +Timestamp mode would silently drop it. Compare mode keeps it, because the commit graph +(not the clock) is the authority on what belongs in the release. + +### Step 4 — `data.since` is still set, but only used for issues + +`data.since` is always derived from the previous release's timestamp, in both modes. +In compare mode it is **not used to filter PRs or commits** — that job is already done +by the graph in Step 1. It is only used for: + +- Fetching recently-updated **issues** (issue filtering is timestamp-based in both modes) +- Date-gating **release notes extraction** from PR/issue body text + +### Step 5 — The filter stage passes PRs and commits through unchanged + +`FilterByRelease` — the stage that normally drops PRs and commits older than +`data.since` — detects that compare mode is active and skips that timestamp check +entirely. The PR and commit sets arriving from `mine_data` are already exact; no further +trimming is needed or correct. + +Issues are always filtered by timestamp regardless of mode. + +--- + +## Data Flow + +``` +from-tag-name provided? + │ + ┌──┴──────────────────────┐ + YES (compare mode) NO (timestamp mode) + │ │ + GitHub Compare API: get_commits(since=data.since) + commits unique to to-tag get_pulls(state=closed) + │ │ + extract PR numbers FilterByRelease drops + from commit messages PRs/commits before since + │ + fetch each PR by number + │ + FilterByRelease: skip timestamp check — pass everything through +``` + +--- + +## Configuration + +```yaml +- name: Generate Release Notes + uses: AbsaOSS/generate-release-notes@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag-name: "v2.6.5" # the release being generated + from-tag-name: "v2.6.4" # providing this activates compare mode + chapters: | + - {"title": "Bugfixes 🛠", "label": "bug"} + - {"title": "Features 🎉", "label": "feature"} +``` + +> **When to use:** always supply `from-tag-name` when releasing from a maintenance branch +> that runs in parallel with a development branch. Omitting it is fine for purely +> linear release histories. + +--- + +## Related Features + +- [Tag Range Selection](./tag_range.md) – explains the user-facing `from-tag-name` input and + its interaction with compare mode. +- [Date Selection](./date_selection.md) – controls whether `created_at` or `published_at` + is used as `data.since` (applies in both modes). +- [Release Notes Extraction](./release_notes_extraction.md) – uses `data.since` for body + scanning; unaffected by compare mode. + +← [Back to Feature Tutorials](../../README.md#feature-tutorials) diff --git a/docs/features/tag_range.md b/docs/features/tag_range.md index ab645149..64e310ed 100644 --- a/docs/features/tag_range.md +++ b/docs/features/tag_range.md @@ -38,6 +38,7 @@ https://github.com/org/repo/compare/v1.5.0...v1.6.0 (The compare URL reflects both `from-tag-name` and `tag-name`.) ## Related Features +- [Compare Mode](./compare_mode.md) – activated automatically when `from-tag-name` is set; explains why graph-based selection is needed for branching histories and how it works internally. - [Date Selection](./date_selection.md) – defines which timestamp of the previous release becomes the cutoff. - [Service Chapters](./service_chapters.md) – uses the same time window to assess gaps. - [Release Notes Extraction](./release_notes_extraction.md) – only processes PRs/issues within the computed window. diff --git a/release_notes_generator/data/SPEC.md b/release_notes_generator/data/SPEC.md new file mode 100644 index 00000000..df243ff8 --- /dev/null +++ b/release_notes_generator/data/SPEC.md @@ -0,0 +1,257 @@ +# SPEC — Compare-mode data mining + +## Problem + +The current `mine_data()` method derives a `since` timestamp from the previous release and fetches all +commits/PRs/issues in that time window. This breaks for branching release histories (e.g. maintenance +`v2.6.x` alongside a `develop`-based `v2.7.x`) because commits from the other branch fall inside the +same window. + +### Concrete example + +Full commit graph (abbreviated): + +``` +develop branch: +* (tag: v2.7.1) 2026-05-20 Improve Kafka consumer throughput (#1401) +* (tag: v2.7.0) 2026-05-14 Fix new service access role (#1363) +* 2026-05-07 Fix/1346 custom hive table (#1349) +* 2026-05-07 Collect latest OpenAPI changes (#1330) +| maintenance/v2.6.x branch: +| * (tag: v2.6.5) 2026-05-20 Backport: handle empty schema in Hive (#1402) +| * (tag: v2.6.4) 2026-05-14 Fix new service access role (#1363) ← cherry-pick +| * (tag: v2.6.3) 2026-05-07 Fix/1346 custom hive table (#1349) ← cherry-pick +|/ +* (tag: v2.6.0) 2026-04-21 Fixes for update-ca-certificates (#1318) +``` + +#### Case 1 — `v2.6.4` (released same day as `v2.7.0`) + +Generating release notes for `v2.6.4` (previous: `v2.6.3`): + +- **Timestamp mode (current):** `since = published_at(v2.6.3) = 2026-05-07`. + `repo.get_commits(since=…)` returns every commit authored between 2026-05-07 and 2026-05-14 + on **any branch** — including `#1349` and `#1330` from develop. Those belong to `v2.7.0`, not + `v2.6.4`, yet they appear in the release notes output. + +- **Compare mode (new):** `repo.compare("v2.6.3", "v2.6.4")` returns only the commits reachable + from `v2.6.4` but **not** from `v2.6.3` — exactly `#1363`. That is the correct set. + +#### Case 2 — `v2.6.5` (released after `v2.7.0` and `v2.7.1`) + +This case is even more broken with the timestamp approach. + +Generating release notes for `v2.6.5` (previous: `v2.6.4`): + +- **Timestamp mode (current):** `since = published_at(v2.6.4) = 2026-05-14`. + `repo.get_commits(since=…)` returns everything authored between 2026-05-14 and 2026-05-20 on + **any branch** — that includes `#1363` (v2.7.0) and `#1401` (v2.7.1) from develop. Neither of + those belongs in the `v2.6.5` patch release notes, but both appear in the output. As more `v2.7.x` + releases accumulate, the noise grows linearly. + +- **Compare mode (new):** `repo.compare("v2.6.4", "v2.6.5")` returns only `#1402` — the single + backport commit unique to `v2.6.5`. That is the correct set, regardless of how many `v2.7.x` + commits exist in the same timestamp window. + +The equivalent Git command the user already uses manually: +``` +git log --oneline --right-only --cherry-pick v2.6.4...v2.6.5 +``` + +--- + +## Solution + +### When compare mode activates + +Compare mode is used when the `from-tag-name` action input is explicitly provided. When it is absent, +the existing timestamp-based path runs unchanged. + +### PyGitHub API used + +`repo.compare(base: str, head: str) → Comparison` + +- Calls `GET /repos/{owner}/{repo}/compare/{base}...{head}` (three-dot diff — commits in `head` not reachable from `base`). +- Available in PyGitHub ≥ 2.x; confirmed present in the project's locked version (`PyGithub==2.9.1`). +- Returns a `github.Comparison.Comparison` object with: + - `commits: PaginatedList[github.Commit.Commit]` — up to 10,000 commits, fully iterable; same `Commit` type already used throughout the codebase. + - `files: list[File]` — first 300 changed files only (not used here). +- The `Commit` objects have the same shape as those returned by `repo.get_commits()`, so no type conversion is needed downstream. + +`repo.get_pull(number: int) → PullRequest` + +- Calls `GET /repos/{owner}/{repo}/pulls/{pull_number}`. +- Returns a full `github.PullRequest.PullRequest` object — same type as those in `data.pull_requests` today. + +### What changes in `DataMiner.mine_data()` + +**Timestamp mode (unchanged path):** +1. Derive `data.since` from the previous release timestamp. +2. `repo.get_commits(since=data.since)` — all commits authored after that timestamp on the default branch. +3. `repo.get_pulls(state="closed", base=default_branch)` — all closed PRs; `FilterByRelease` trims them by `merged_at >= data.since`. +4. `repo.get_issues(since=data.since)` — all issues updated after that timestamp. + +**Compare mode (new path):** +1. Derive `data.since` from the previous release timestamp (same as before — used by `_get_issues` for + unrelated issue fetching, kept for consistency with the rest of the pipeline). +2. `comparison = repo.compare(from_tag, to_tag)` — returns only commits reachable from `to_tag` but + not from `from_tag`. Same semantics as `git log --right-only --cherry-pick from_tag...to_tag`. +3. Iterate `comparison.commits` to collect the commit objects and their SHAs. +4. Extract PR numbers from the commit messages (helper `_extract_pr_numbers_from_commits`): + - Pattern A (squash-merge): `(#NNN)` anywhere in the message → PR number `NNN`. + - Pattern B (merge-commit): `Merge pull request #NNN from …` → PR number `NNN`. + - Both patterns handled by a single regex `\(#(\d+)\)|Merge pull request #(\d+)`. + - Cherry-picks are handled automatically: the commit message on the maintenance branch is + identical to the one on develop, so the PR number is always present regardless of which SHA + was used for the cherry-pick. +5. For each extracted PR number, call `repo.get_pull(number)` to retrieve the full PR object. + `None` results (deleted/missing PRs) are silently skipped. +6. `data.pull_requests` is populated with only those PRs; `data.issues` is left as `{}`. +7. `data.compare_commit_shas` is set to the set of SHAs from step 3. + +### What changes in `FilterByRelease` + +`FilterByRelease` currently trims commits and PRs by timestamp. In compare mode the commit/PR set is +already exact, so timestamp trimming must be skipped. + +Guard: if `data.compare_commit_shas` is non-empty, copy commits and PRs through unchanged; otherwise +run the existing timestamp filter logic. + +### New field on `MinedData` + +`compare_commit_shas: set[str]` — empty by default; set to the SHA strings of all commits returned by +`repo.compare(…)` in compare mode. Acts as the sentinel that tells `FilterByRelease` which path to take. + +### What does NOT change + +- Issue fetching and filtering logic is untouched. +- `data.since` is still derived from the previous release and propagated everywhere that uses it + (e.g. `custom_chapters.since` for release-note extraction date-gating). +- The semantic-version ordering path for finding the previous release is untouched. +- All downstream record building (factory, chapters, builder) is unaffected. + +--- + +## Files changed + +- `release_notes_generator/model/mined_data.py` — add `compare_commit_shas: set[str]` field +- `release_notes_generator/data/miner.py` — add compare-mode branch in `mine_data()`; add helper `_extract_pr_numbers_from_commits()` +- `release_notes_generator/data/filter.py` — skip timestamp filter for PRs and commits when `compare_commit_shas` is non-empty + +--- + +## Test cases (confirmed green ✅) + +All 22 tests implemented and passing (`tests/unit/release_notes_generator/data/`). + +### Helper: `DataMiner._extract_pr_numbers_from_commits` — `test_miner.py` + +| # | Name | Intent | Commit messages | Expected `set[int]` | +|---|------|--------|-----------------|---------------------| +| 1 | `test_extract_pr_numbers_squash_format` | Squash-merge `(#NNN)` tail | `["Fix service access role (#1363)"]` | `{1363}` | +| 2 | `test_extract_pr_numbers_merge_format` | Merge-commit `Merge pull request #NNN` header | `["Merge pull request #1358 from org/branch"]` | `{1358}` | +| 3 | `test_extract_pr_numbers_multiple_commits` | All PR numbers from multiple commits | 3 commits: `"…(#10)"`, `"Merge pull request #20 from …"`, `"…(#30)"` | `{10, 20, 30}` | +| 4 | `test_extract_pr_numbers_deduplicates` | Same PR referenced in two commits is returned once | `["Fix A (#42)", "Follow-up for #42 (#42)"]` | `{42}` | +| 5 | `test_extract_pr_numbers_no_references` | No `#NNN` patterns → empty set | `["Initial commit", "Fix typo in README"]` | `set()` | +| 6 | `test_extract_pr_numbers_empty_input` | Empty commit list → empty set | `[]` | `set()` | +| 7 | `test_extract_pr_numbers_multiline_message` | PR number in second line of message | `["Subject line\n\nFixes behaviour (#77)"]` | `{77}` | + +### `DataMiner.mine_data()` — compare mode — `test_miner.py` + +| # | Name | Intent | Setup | Expected | +|---|------|--------|-------|----------| +| 8 | `test_mine_data_compare_mode_uses_compare_api` | `repo.compare` is called (not `get_commits`) | `is_from_tag_name_defined=True`, from-tag=`v2.6.3`, to-tag=`v2.6.4`; compare returns 1 commit `#1363` | `repo.compare` called once; `repo.get_commits` NOT called; `data.compare_commit_shas == {sha_of_1363}` | +| 9 | `test_mine_data_compare_mode_fetches_prs_by_number` | PRs fetched individually by number from message | Compare commit message `"Fix (#42)"`; `repo.get_pull(42)` returns mock PR | `data.pull_requests == {mock_pr: repo}` | +| 10 | `test_mine_data_compare_mode_multiple_prs` | Two commits → two distinct PRs fetched | Messages `"A (#10)"` and `"B (#20)"`; both `get_pull` calls return valid PRs | `len(data.pull_requests) == 2` | +| 11 | `test_mine_data_compare_mode_leaves_issues_empty` | `data.issues` is `{}` in compare mode | Any compare setup | `data.issues == {}` | +| 12 | `test_mine_data_compare_mode_sets_data_since` | `data.since` still derived from release `created_at` | Release with `created_at=2026-05-07`; `published-at=false` | `data.since == datetime(2026, 5, 7)` | +| 13 | `test_mine_data_compare_mode_sets_data_since_published_at` | `data.since` uses `published_at` when flag set | Release with `published_at=2026-05-08`; `published-at=true` | `data.since == datetime(2026, 5, 8)` | +| 14 | `test_mine_data_compare_mode_skips_none_prs` | `get_pull` returning `None` silently omitted | Compare commit refs PR #99; `get_pull(99)` returns None | `data.pull_requests == {}` | +| 15 | `test_mine_data_compare_mode_no_pr_numbers_in_message` | Commit with no PR ref produces no PRs | Commit message `"Bump version to 2.6.4"` | `data.pull_requests == {}`; `data.compare_commit_shas` non-empty | + +### `DataMiner.mine_data()` — timestamp mode (regression) — `test_miner.py` + +| # | Name | Intent | Setup | Expected | +|---|------|--------|-------|----------| +| 16 | `test_mine_data_timestamp_mode_uses_get_commits` | Timestamp path calls `get_commits(since=…)` not `compare` | `is_from_tag_name_defined=False`; `data.since` set | `repo.get_commits` called with `since=data.since`; `repo.compare` NOT called | +| 17 | `test_mine_data_timestamp_mode_compare_shas_empty` | `compare_commit_shas` is empty set in timestamp mode | `is_from_tag_name_defined=False` | `data.compare_commit_shas == set()` | + +### `FilterByRelease` — compare mode guard — `test_filter.py` + +| # | Name | Intent | Setup | Expected | +|---|------|--------|-------|----------| +| 18 | `test_filter_compare_mode_passes_prs_through` | PRs kept regardless of `merged_at` when in compare mode | `compare_commit_shas` non-empty; PR `merged_at` is 30 days before `data.since` | PR present in `filtered.pull_requests` | +| 19 | `test_filter_compare_mode_passes_commits_through` | Commits kept regardless of author date when in compare mode | `compare_commit_shas` non-empty; commit author date is 30 days before `data.since` | Commit present in `filtered.commits` | +| 20 | `test_filter_compare_mode_passes_multiple_prs_and_commits` | All PRs and commits pass through unfiltered | `compare_commit_shas` non-empty; 3 PRs and 2 commits all dated before `data.since` | All 3 PRs and 2 commits in filtered result | +| 21 | `test_filter_timestamp_mode_filters_old_prs` | Timestamp mode: PR before `since` is excluded | `compare_commit_shas == set()`; PR `merged_at` before `data.since` | PR absent from `filtered.pull_requests` | +| 22 | `test_filter_timestamp_mode_keeps_recent_prs` | Timestamp mode: PR after `since` is kept | `compare_commit_shas == set()`; PR `merged_at` after `data.since` | PR present in `filtered.pull_requests` (regression guard) | + +--- + +## Proposed documentation updates + +These changes should be applied to the user-facing docs once the feature ships. + +### `docs/features/tag_range.md` + +Replace the **How It Works** section with: + +```markdown +## How It Works + +### Timestamp mode (default when `from-tag-name` is omitted) +- The previous release's timestamp is used as the `since` boundary. +- Commits, PRs, and issues updated after that timestamp are fetched. +- **Limitation:** breaks for branching release histories — commits from other active branches + (e.g. `v2.7.x`) fall inside the same timestamp window and appear in the patch release notes. + +### Compare mode (activated when `from-tag-name` is explicitly provided) +- Uses the GitHub Compare API (`GET /repos/{owner}/{repo}/compare/{from-tag}...{to-tag}`). +- Returns only commits reachable from `tag-name` but **not** from `from-tag-name`, regardless + of timestamp or branch. +- PR numbers are extracted from those commit messages; only the matching PRs are fetched. +- Cherry-picks are handled automatically: the commit message on the maintenance branch preserves + the original PR reference. +- Equivalent: `git log --oneline --right-only --cherry-pick ...` + +**Example** — `v2.6.5` (previous: `v2.6.4`): + +​``` +develop: +* (tag: v2.7.1) 2026-05-20 Improve Kafka consumer throughput (#1401) +* (tag: v2.7.0) 2026-05-14 Fix new service access role (#1363) +maintenance/v2.6.x: +| * (tag: v2.6.5) 2026-05-20 Backport: handle empty schema in Hive (#1402) +| * (tag: v2.6.4) 2026-05-14 Fix new service access role (#1363) ← cherry-pick +|/ +* (tag: v2.6.0) 2026-04-21 … +​``` + +- **Timestamp mode** fetches commits between 2026-05-14 and 2026-05-20 on any branch → captures + `#1363` and `#1401` from develop. Incorrect. +- **Compare mode** (`repo.compare("v2.6.4", "v2.6.5")`) returns only `#1402`. Correct. + +### Other behaviour (both modes) +- Tag normalization: `1.6.0` and `v1.6.0` are both accepted. +- `data.since` is still derived from the previous release in compare mode (used for issue fetching + and release-note extraction date-gating). +- If no prior release exists (first release), all issues/PRs are included. +``` + +Also update the **Configuration** example to show `from-tag-name: "v2.6.4"` and the +**Example Result** to show only `#1402`. + +--- + +### `docs/configuration_reference.md` + +Update the `from-tag-name` row description from: + +> `Explicit previous release tag; if empty semantic latest published release is used.` + +To: + +> `Explicit previous release tag. When provided, activates compare mode (graph-based commit +> selection via GitHub Compare API), which is correct for branching release histories +> (e.g. maintenance v2.6.x alongside v2.7.x). If empty, semantic latest published release is +> used with timestamp-based filtering. See [Tag Range](../docs/features/tag_range.md).` diff --git a/release_notes_generator/data/filter.py b/release_notes_generator/data/filter.py index cc827df6..52852e5d 100644 --- a/release_notes_generator/data/filter.py +++ b/release_notes_generator/data/filter.py @@ -68,6 +68,7 @@ def filter(self, data: MinedData) -> MinedData: md = MinedData(data.home_repository) md.release = data.release md.since = data.since + md.compare_commit_shas = data.compare_commit_shas if data.release is not None: logger.info("Starting issue, prs and commit reduction by the latest release since time.") @@ -75,27 +76,37 @@ def filter(self, data: MinedData) -> MinedData: issues_dict = self._filter_issues(data) logger.debug("Count of issues reduced from %d to %d", len(data.issues), len(issues_dict)) - # filter out merged PRs and commits before the date - pulls_seen: set[int] = set() - pulls_dict: dict[PullRequest, Repository] = {} - for pull, repo in data.pull_requests.items(): - if data.since and ( - (pull.merged_at and pull.merged_at >= data.since) - or (pull.closed_at and pull.closed_at >= data.since) - ): - if pull.number not in pulls_seen: - pulls_seen.add(pull.number) - pulls_dict[pull] = repo - logger.debug( - "Count of pulls reduced from %d to %d", len(data.pull_requests.items()), len(pulls_dict.items()) - ) - - commits_dict = { - commit: repo - for commit, repo in data.commits.items() - if data.since and commit.commit.author.date > data.since - } - logger.debug("Count of commits reduced from %d to %d", len(data.commits.items()), len(commits_dict.items())) + if data.compare_commit_shas: + # compare mode: PR and commit sets are already exact — pass through unchanged + pulls_dict = dict(data.pull_requests) + commits_dict = dict(data.commits) + logger.debug("Compare mode: skipping PR/commit timestamp filter.") + else: + # filter out merged PRs and commits before the date + pulls_seen: set[int] = set() + pulls_dict = {} + for pull, repo in data.pull_requests.items(): + if data.since and ( + (pull.merged_at and pull.merged_at >= data.since) + or (pull.closed_at and pull.closed_at >= data.since) + ): + if pull.number not in pulls_seen: + pulls_seen.add(pull.number) + pulls_dict[pull] = repo + logger.debug( + "Count of pulls reduced from %d to %d", + len(data.pull_requests.items()), len(pulls_dict.items()), + ) + + commits_dict = { + commit: repo + for commit, repo in data.commits.items() + if data.since and commit.commit.author.date > data.since + } + logger.debug( + "Count of commits reduced from %d to %d", + len(data.commits.items()), len(commits_dict.items()), + ) md.issues = issues_dict md.pull_requests = pulls_dict diff --git a/release_notes_generator/data/miner.py b/release_notes_generator/data/miner.py index 3161f527..e086cd19 100644 --- a/release_notes_generator/data/miner.py +++ b/release_notes_generator/data/miner.py @@ -19,6 +19,7 @@ """ import logging +import re import sys import traceback from concurrent.futures import ThreadPoolExecutor, as_completed, CancelledError @@ -33,6 +34,9 @@ from release_notes_generator.action_inputs import ActionInputs from release_notes_generator.data.utils.bulk_sub_issue_collector import BulkSubIssueCollector +from github.Commit import Commit as GithubCommit + +_PR_NUMBER_RE = re.compile(r"\(#(\d+)\)|Merge pull request #(\d+)") from release_notes_generator.model.record.issue_record import IssueRecord from release_notes_generator.model.mined_data import MinedData from release_notes_generator.model.record.pull_request_record import PullRequestRecord @@ -66,16 +70,35 @@ def mine_data(self) -> MinedData: self._get_issues(data) - # pulls and commits, and then reduce them by the latest release since time - pull_requests = list( - self._safe_call(repo.get_pulls)(state=PullRequestRecord.PR_STATE_CLOSED, base=repo.default_branch) - ) - data.pull_requests = {pr: data.home_repository for pr in pull_requests} - if data.since: - commits = list(self._safe_call(repo.get_commits)(since=data.since)) + if ActionInputs.is_from_tag_name_defined(): + logger.info("Compare mode: using repo.compare('%s', '%s').", + ActionInputs.get_from_tag_name(), ActionInputs.get_tag_name()) + comparison = self._safe_call(repo.compare)( + ActionInputs.get_from_tag_name(), ActionInputs.get_tag_name() + ) + compare_commits: list[GithubCommit] = list(comparison.commits) + data.compare_commit_shas = {c.sha for c in compare_commits} + data.commits = {c: data.home_repository for c in compare_commits} + pr_numbers = self._extract_pr_numbers_from_commits(compare_commits) + pulls: dict[PullRequest, Repository] = {} + for number in pr_numbers: + pr = self._safe_call(repo.get_pull)(number) + if pr is not None: + pulls[pr] = data.home_repository + data.pull_requests = pulls + logger.info("Compare mode: found %d commit(s), %d PR(s).", + len(compare_commits), len(data.pull_requests)) else: - commits = list(self._safe_call(repo.get_commits)()) - data.commits = {c: data.home_repository for c in commits} + # pulls and commits, then reduce them by the latest release since time + pull_requests = list( + self._safe_call(repo.get_pulls)(state=PullRequestRecord.PR_STATE_CLOSED, base=repo.default_branch) + ) + data.pull_requests = {pr: data.home_repository for pr in pull_requests} + if data.since: + commits = list(self._safe_call(repo.get_commits)(since=data.since)) + else: + commits = list(self._safe_call(repo.get_commits)()) + data.commits = {c: data.home_repository for c in commits} logger.info("Initial data mining from GitHub completed.") @@ -423,6 +446,24 @@ def __get_latest_semantic_release(releases) -> Optional[GitRelease]: return rls + @staticmethod + def _extract_pr_numbers_from_commits(commits: list[GithubCommit]) -> set[int]: + """ + Extract unique PR numbers from commit messages. + + Parameters: + commits: Commit objects whose messages are scanned. + Returns: + set[int]: Unique PR numbers found across all messages. + """ + pr_numbers: set[int] = set() + for commit in commits: + message = commit.commit.message + for match in _PR_NUMBER_RE.finditer(message): + number_str = match.group(1) or match.group(2) + pr_numbers.add(int(number_str)) + return pr_numbers + @staticmethod def __filter_duplicated_issues(data: MinedData) -> "MinedData": """ diff --git a/release_notes_generator/model/mined_data.py b/release_notes_generator/model/mined_data.py index 4f34969a..49a9eede 100644 --- a/release_notes_generator/model/mined_data.py +++ b/release_notes_generator/model/mined_data.py @@ -47,6 +47,7 @@ def __init__(self, repository: Repository): self.issues: dict[Issue, Repository] = {} self.pull_requests: dict[PullRequest, Repository] = {} self.commits: dict[Commit, Repository] = {} + self.compare_commit_shas: set[str] = set() self.parents_sub_issues: dict[str, list[str]] = {} # parent issue id -> list of its sub-issues ids # dictionary of fetched cross issues and their pull requests diff --git a/tests/integration/test_compare_mode.py b/tests/integration/test_compare_mode.py new file mode 100644 index 00000000..4a43768d --- /dev/null +++ b/tests/integration/test_compare_mode.py @@ -0,0 +1,292 @@ +# +# Copyright 2023 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Offline integration tests — compare mode (graph-based commit selection). + +Covers the end-to-end behaviour difference between timestamp mode and compare mode +through the full pipeline (main.run → mine_data → filter → build → output). + +The key observable difference: + - Timestamp mode: PRs/commits whose merged_at/author.date pre-dates data.since are + silently dropped by FilterByRelease. + - Compare mode: PRs/commits in data.pull_requests/data.commits pass through + unchanged — FilterByRelease treats the set as already exact. + +In a real branching history this means maintenance-branch cherry-picks (whose +merge date may pre-date the previous tag on the develop branch) are always +included in the compare-mode output. +""" + +from collections.abc import Callable +from datetime import datetime, timedelta + +from pytest_mock import MockerFixture + +from github.Commit import Commit +from github.GitRelease import GitRelease +from github.Issue import Issue +from github.PullRequest import PullRequest +from github.Repository import Repository + +from release_notes_generator.data.miner import DataMiner +from tests.integration.helpers import build_mined_data, capture_run + + +# --------------------------------------------------------------------------- +# Compare mode: PRs dated before data.since still appear in the output +# --------------------------------------------------------------------------- + + +def test_compare_mode_includes_pr_dated_before_since( + mocker: MockerFixture, + patch_env: Callable, + mock_github: object, + make_issue: Callable[..., Issue], + make_pr: Callable[..., PullRequest], + make_repo: Callable[..., Repository], + make_release: Callable[..., GitRelease], +) -> None: + """In compare mode a PR merged before data.since is retained in the output. + + Simulates a cherry-pick commit on a maintenance branch: the PR was merged + before the previous-tag timestamp but belongs in the release because the + compare API returned its commit. + """ + repo = make_repo("org/repo") + release = make_release("v2.6.3") + + since = datetime(2026, 5, 7) + # PR merged 30 days before since — timestamp mode would drop it + pr_cherry_pick = make_pr( + 1363, + title="Fix new service access role", + body="", + user_login="dev1", + merged=True, + ) + pr_cherry_pick.merged_at = since - timedelta(days=30) + pr_cherry_pick.closed_at = pr_cherry_pick.merged_at + pr_cherry_pick.merge_commit_sha = "abc1363" + + data = build_mined_data( + repo=repo, + issues=[], + pull_requests=[(pr_cherry_pick, repo)], + commits=[], + release=release, + since=since, + ) + # compare mode sentinel + data.compare_commit_shas = {"abc1363"} + + mocker.patch.object(DataMiner, "check_repository_exists", return_value=True) + mocker.patch.object(DataMiner, "mine_data", return_value=data) + mocker.patch( + "release_notes_generator.record.factory.default_record_factory.get_issues_for_pr", + return_value=set(), + ) + + actual = capture_run( + patch_env, + { + "INPUT_TAG_NAME": "v2.6.4", + "INPUT_FROM_TAG_NAME": "v2.6.3", + }, + ) + + assert "Fix new service access role" in actual, ( + "Compare-mode PR dated before data.since must appear in the release notes" + ) + + +# --------------------------------------------------------------------------- +# Timestamp mode: same PR is excluded when merged before data.since +# --------------------------------------------------------------------------- + + +def test_timestamp_mode_excludes_pr_dated_before_since( + mocker: MockerFixture, + patch_env: Callable, + mock_github: object, + make_issue: Callable[..., Issue], + make_pr: Callable[..., PullRequest], + make_repo: Callable[..., Repository], + make_release: Callable[..., GitRelease], +) -> None: + """In timestamp mode a PR merged before data.since is excluded. + + This is the control case: identical setup to the compare-mode test but + without compare_commit_shas populated, so FilterByRelease applies + timestamp filtering and drops the old PR. + """ + repo = make_repo("org/repo") + release = make_release("v2.6.3") + + since = datetime(2026, 5, 7) + pr_old = make_pr( + 1363, + title="Fix new service access role", + body="", + user_login="dev1", + merged=True, + ) + pr_old.merged_at = since - timedelta(days=30) + pr_old.closed_at = pr_old.merged_at + + data = build_mined_data( + repo=repo, + issues=[], + pull_requests=[(pr_old, repo)], + commits=[], + release=release, + since=since, + ) + # compare_commit_shas is empty → timestamp mode (default from build_mined_data) + + mocker.patch.object(DataMiner, "check_repository_exists", return_value=True) + mocker.patch.object(DataMiner, "mine_data", return_value=data) + mocker.patch( + "release_notes_generator.record.factory.default_record_factory.get_issues_for_pr", + return_value=set(), + ) + + actual = capture_run( + patch_env, + { + "INPUT_TAG_NAME": "v2.6.4", + "INPUT_FROM_TAG_NAME": "", + }, + ) + + assert "Fix new service access role" not in actual, ( + "Timestamp-mode PR dated before data.since must be absent from the release notes" + ) + + +# --------------------------------------------------------------------------- +# Compare mode: multiple PRs all retained, none filtered by timestamp +# --------------------------------------------------------------------------- + + +def test_compare_mode_retains_all_prs_regardless_of_date( + mocker: MockerFixture, + patch_env: Callable, + mock_github: object, + make_issue: Callable[..., Issue], + make_pr: Callable[..., PullRequest], + make_repo: Callable[..., Repository], + make_release: Callable[..., GitRelease], +) -> None: + """All compare-mode PRs appear even when all predate data.since. + + Represents the v2.6.5 scenario: the backport PR and any other PRs + unique to the maintenance branch all belong in the release notes. + """ + repo = make_repo("org/repo") + release = make_release("v2.6.4") + + since = datetime(2026, 5, 14) + old_date = since - timedelta(days=60) + + pr_a = make_pr(1402, title="Backport: handle empty schema in Hive", body="", user_login="dev1", merged=True) + pr_a.merged_at = old_date + pr_a.closed_at = old_date + pr_a.merge_commit_sha = "sha1402" + + pr_b = make_pr(1350, title="Backport: fix config loader", body="", user_login="dev2", merged=True) + pr_b.merged_at = old_date + pr_b.closed_at = old_date + pr_b.merge_commit_sha = "sha1350" + + data = build_mined_data( + repo=repo, + issues=[], + pull_requests=[(pr_a, repo), (pr_b, repo)], + commits=[], + release=release, + since=since, + ) + data.compare_commit_shas = {"sha1402", "sha1350"} + + mocker.patch.object(DataMiner, "check_repository_exists", return_value=True) + mocker.patch.object(DataMiner, "mine_data", return_value=data) + mocker.patch( + "release_notes_generator.record.factory.default_record_factory.get_issues_for_pr", + return_value=set(), + ) + + actual = capture_run( + patch_env, + { + "INPUT_TAG_NAME": "v2.6.5", + "INPUT_FROM_TAG_NAME": "v2.6.4", + }, + ) + + assert "Backport: handle empty schema in Hive" in actual + assert "Backport: fix config loader" in actual + + +# --------------------------------------------------------------------------- +# Compare mode: commits dated before data.since are included in service chapters +# --------------------------------------------------------------------------- + + +def test_compare_mode_includes_commit_dated_before_since( + mocker: MockerFixture, + patch_env: Callable, + mock_github: object, + make_commit: Callable[..., Commit], + make_repo: Callable[..., Repository], + make_release: Callable[..., GitRelease], +) -> None: + """A direct commit pre-dating data.since appears in the service chapter in compare mode.""" + repo = make_repo("org/repo") + release = make_release("v2.6.3") + + since = datetime(2026, 5, 7) + + commit = make_commit("bumpsha1234", "Bump version to 2.6.4") + commit.commit.author.date = since - timedelta(days=10) + + data = build_mined_data( + repo=repo, + issues=[], + pull_requests=[], + commits=[(commit, repo)], + release=release, + since=since, + ) + data.compare_commit_shas = {"bumpsha1234"} + + mocker.patch.object(DataMiner, "check_repository_exists", return_value=True) + mocker.patch.object(DataMiner, "mine_data", return_value=data) + mocker.patch( + "release_notes_generator.record.factory.default_record_factory.get_issues_for_pr", + return_value=set(), + ) + + actual = capture_run( + patch_env, + { + "INPUT_TAG_NAME": "v2.6.4", + "INPUT_FROM_TAG_NAME": "v2.6.3", + "INPUT_WARNINGS": "true", + }, + ) + + assert "Bump version to 2.6.4" in actual, ( + "Compare-mode commit pre-dating data.since must appear in service chapter" + ) diff --git a/tests/unit/release_notes_generator/data/test_filter.py b/tests/unit/release_notes_generator/data/test_filter.py index 75cb5c85..bc3f78cc 100644 --- a/tests/unit/release_notes_generator/data/test_filter.py +++ b/tests/unit/release_notes_generator/data/test_filter.py @@ -15,7 +15,7 @@ from unittest.mock import MagicMock -from datetime import datetime +from datetime import datetime, timedelta from github.Repository import Repository @@ -32,6 +32,7 @@ def test_filter_no_release(mocker): data.home_repository = MagicMock(spec=Repository) data.since = datetime(2023, 1, 1) data.release = None + data.compare_commit_shas = set() data.issues = [MagicMock(closed_at=None), MagicMock(closed_at=None)] data.pull_requests = [MagicMock(merged_at=None), MagicMock(merged_at=None)] data.commits = [MagicMock(commit=MagicMock(author=MagicMock(date=None)))] @@ -57,6 +58,7 @@ def test_filter_with_release(mocker): data.home_repository = MagicMock(spec=Repository) data.release = MagicMock() data.since = datetime(2023, 1, 1) + data.compare_commit_shas = set() # Mock issues, pull requests, and commits data.issues = { @@ -89,3 +91,108 @@ def test_filter_with_release(mocker): assert ("Count of issues reduced from %d to %d", 2, 1) == mock_log_debug.call_args_list[1][0] assert ("Count of pulls reduced from %d to %d", 2, 1) == mock_log_debug.call_args_list[2][0] assert ("Count of commits reduced from %d to %d", 2, 1) == mock_log_debug.call_args_list[3][0] + + +# --- FilterByRelease compare mode guard --- + + +def test_filter_compare_mode_passes_prs_through(): + """PRs retained regardless of merged_at when compare_commit_shas is non-empty.""" + data = MagicMock(spec=MinedData) + data.home_repository = MagicMock(spec=Repository) + data.release = MagicMock() + data.since = datetime(2026, 5, 14) + data.compare_commit_shas = {"sha_abc"} + data.issues = {} + data.commits = {} + old_pr = MagicMock() + old_pr.number = 1 + old_pr.merged_at = datetime(2026, 5, 14) - timedelta(days=30) + old_pr.closed_at = old_pr.merged_at + data.pull_requests = {old_pr: data.home_repository} + + filtered = FilterByRelease().filter(data) + + assert old_pr in filtered.pull_requests + + +def test_filter_compare_mode_passes_commits_through(): + """Commits retained regardless of author date when compare_commit_shas is non-empty.""" + data = MagicMock(spec=MinedData) + data.home_repository = MagicMock(spec=Repository) + data.release = MagicMock() + data.since = datetime(2026, 5, 14) + data.compare_commit_shas = {"sha_abc"} + data.issues = {} + data.pull_requests = {} + old_commit = MagicMock() + old_commit.commit.author.date = datetime(2026, 5, 14) - timedelta(days=30) + data.commits = {old_commit: data.home_repository} + + filtered = FilterByRelease().filter(data) + + assert old_commit in filtered.commits + + +def test_filter_compare_mode_passes_multiple_prs_and_commits(): + """All PRs and commits pass through unfiltered in compare mode.""" + data = MagicMock(spec=MinedData) + data.home_repository = MagicMock(spec=Repository) + data.release = MagicMock() + data.since = datetime(2026, 5, 14) + data.compare_commit_shas = {"sha1", "sha2"} + data.issues = {} + old_date = datetime(2026, 5, 14) - timedelta(days=30) + data.pull_requests = { + MagicMock(number=i, merged_at=old_date, closed_at=old_date): data.home_repository + for i in range(3) + } + data.commits = { + MagicMock(commit=MagicMock(author=MagicMock(date=old_date))): data.home_repository + for _ in range(2) + } + + filtered = FilterByRelease().filter(data) + + assert len(filtered.pull_requests) == 3 + assert len(filtered.commits) == 2 + + +def test_filter_timestamp_mode_filters_old_prs(): + """PR before since excluded in timestamp mode (compare_commit_shas empty).""" + data = MagicMock(spec=MinedData) + data.home_repository = MagicMock(spec=Repository) + data.release = MagicMock() + data.since = datetime(2026, 5, 14) + data.compare_commit_shas = set() + data.issues = {} + data.commits = {} + old_pr = MagicMock() + old_pr.number = 1 + old_pr.merged_at = datetime(2026, 5, 14) - timedelta(days=30) + old_pr.closed_at = old_pr.merged_at + data.pull_requests = {old_pr: data.home_repository} + + filtered = FilterByRelease().filter(data) + + assert old_pr not in filtered.pull_requests + + +def test_filter_timestamp_mode_keeps_recent_prs(): + """PR after since retained in timestamp mode (regression guard).""" + data = MagicMock(spec=MinedData) + data.home_repository = MagicMock(spec=Repository) + data.release = MagicMock() + data.since = datetime(2026, 5, 14) + data.compare_commit_shas = set() + data.issues = {} + data.commits = {} + new_pr = MagicMock() + new_pr.number = 2 + new_pr.merged_at = datetime(2026, 5, 15) + new_pr.closed_at = datetime(2026, 5, 15) + data.pull_requests = {new_pr: data.home_repository} + + filtered = FilterByRelease().filter(data) + + assert new_pr in filtered.pull_requests diff --git a/tests/unit/release_notes_generator/data/test_miner.py b/tests/unit/release_notes_generator/data/test_miner.py index 3e8a666a..1c926b97 100644 --- a/tests/unit/release_notes_generator/data/test_miner.py +++ b/tests/unit/release_notes_generator/data/test_miner.py @@ -520,3 +520,220 @@ def test_fetch_prs_for_fetched_cross_issues(mocker, mock_repo): assert key_ok in result and result[key_ok] == [pr_obj] assert key_err in result and result[key_err] == [] warn_mock.assert_called_once() + + +# --- _extract_pr_numbers_from_commits --- + + +def test_extract_pr_numbers_squash_format(mocker): + commit = mocker.Mock() + commit.commit.message = "Fix service access role (#1363)" + assert DataMiner._extract_pr_numbers_from_commits([commit]) == {1363} + + +def test_extract_pr_numbers_merge_format(mocker): + commit = mocker.Mock() + commit.commit.message = "Merge pull request #1358 from org/branch" + assert DataMiner._extract_pr_numbers_from_commits([commit]) == {1358} + + +def test_extract_pr_numbers_multiple_commits(mocker): + c1, c2, c3 = mocker.Mock(), mocker.Mock(), mocker.Mock() + c1.commit.message = "Feature A (#10)" + c2.commit.message = "Merge pull request #20 from org/feat-b" + c3.commit.message = "Hotfix C (#30)" + assert DataMiner._extract_pr_numbers_from_commits([c1, c2, c3]) == {10, 20, 30} + + +def test_extract_pr_numbers_deduplicates(mocker): + c1, c2 = mocker.Mock(), mocker.Mock() + c1.commit.message = "Fix A (#42)" + c2.commit.message = "Follow-up for #42 (#42)" + assert DataMiner._extract_pr_numbers_from_commits([c1, c2]) == {42} + + +def test_extract_pr_numbers_no_references(mocker): + c1, c2 = mocker.Mock(), mocker.Mock() + c1.commit.message = "Initial commit" + c2.commit.message = "Fix typo in README" + assert DataMiner._extract_pr_numbers_from_commits([c1, c2]) == set() + + +def test_extract_pr_numbers_empty_input(): + assert DataMiner._extract_pr_numbers_from_commits([]) == set() + + +def test_extract_pr_numbers_multiline_message(mocker): + commit = mocker.Mock() + commit.commit.message = "Subject line\n\nFixes behaviour (#77)" + assert DataMiner._extract_pr_numbers_from_commits([commit]) == {77} + + +# --- mine_data compare mode --- + + +def _make_compare_miner(mocker, mock_repo, *, from_tag="v2.6.3", to_tag="v2.6.4", + created_at=datetime(2026, 5, 7), published_at=None, prefer_published=False, + compare_commits=None, get_pull_side_effect=None): + """Wire a DataMiner for compare-mode mine_data calls.""" + mocker.patch("release_notes_generator.action_inputs.ActionInputs.is_from_tag_name_defined", return_value=True) + mocker.patch("release_notes_generator.action_inputs.ActionInputs.get_from_tag_name", return_value=from_tag) + mocker.patch("release_notes_generator.action_inputs.ActionInputs.get_tag_name", return_value=to_tag) + mocker.patch("release_notes_generator.action_inputs.ActionInputs.get_github_repository", return_value="org/repo") + mocker.patch("release_notes_generator.action_inputs.ActionInputs.get_published_at", return_value=prefer_published) + + release_mock = mocker.Mock(spec=GitRelease) + release_mock.created_at = created_at + release_mock.published_at = published_at + release_mock.tag_name = from_tag + mock_repo.get_release.return_value = release_mock + mock_repo.get_issues.return_value = [] + + comparison_mock = mocker.Mock() + comparison_mock.commits = compare_commits if compare_commits is not None else [] + mock_repo.compare.return_value = comparison_mock + + if get_pull_side_effect is not None: + mock_repo.get_pull.side_effect = get_pull_side_effect + else: + mock_repo.get_pull.return_value = mocker.Mock(spec=PullRequest) + + github_mock = mocker.Mock(spec=Github) + github_mock.get_repo.return_value = mock_repo + + miner = DataMiner(github_mock, mocker.Mock()) + miner._safe_call = decorator_mock + return miner + + +def test_mine_data_compare_mode_uses_compare_api(mocker, mock_repo): + commit_mock = mocker.Mock() + commit_mock.sha = "abc123" + commit_mock.commit.message = "Fix service access role (#1363)" + + miner = _make_compare_miner(mocker, mock_repo, compare_commits=[commit_mock]) + data = miner.mine_data() + + mock_repo.compare.assert_called_once_with("v2.6.3", "v2.6.4") + mock_repo.get_commits.assert_not_called() + assert data.compare_commit_shas == {"abc123"} + + +def test_mine_data_compare_mode_fetches_prs_by_number(mocker, mock_repo): + commit_mock = mocker.Mock() + commit_mock.sha = "abc123" + commit_mock.commit.message = "Fix service access role (#42)" + pr_mock = mocker.Mock(spec=PullRequest) + pr_mock.number = 42 + + miner = _make_compare_miner(mocker, mock_repo, compare_commits=[commit_mock], + get_pull_side_effect=lambda n: pr_mock if n == 42 else None) + data = miner.mine_data() + + mock_repo.get_pull.assert_called_once_with(42) + assert pr_mock in data.pull_requests + assert data.pull_requests[pr_mock] == mock_repo + + +def test_mine_data_compare_mode_multiple_prs(mocker, mock_repo): + c1, c2 = mocker.Mock(), mocker.Mock() + c1.sha, c2.sha = "sha1", "sha2" + c1.commit.message = "Feature A (#10)" + c2.commit.message = "Fix B (#20)" + pr10 = mocker.Mock(spec=PullRequest) + pr10.number = 10 + pr20 = mocker.Mock(spec=PullRequest) + pr20.number = 20 + + miner = _make_compare_miner(mocker, mock_repo, compare_commits=[c1, c2], + get_pull_side_effect=lambda n: pr10 if n == 10 else pr20) + data = miner.mine_data() + + assert len(data.pull_requests) == 2 + + +def test_mine_data_compare_mode_leaves_issues_empty(mocker, mock_repo): + miner = _make_compare_miner(mocker, mock_repo, compare_commits=[]) + data = miner.mine_data() + assert data.issues == {} + + +def test_mine_data_compare_mode_sets_data_since(mocker, mock_repo): + miner = _make_compare_miner(mocker, mock_repo, created_at=datetime(2026, 5, 7), prefer_published=False) + data = miner.mine_data() + assert data.since == datetime(2026, 5, 7) + + +def test_mine_data_compare_mode_sets_data_since_published_at(mocker, mock_repo): + miner = _make_compare_miner(mocker, mock_repo, + created_at=datetime(2026, 5, 7), + published_at=datetime(2026, 5, 8), + prefer_published=True) + data = miner.mine_data() + assert data.since == datetime(2026, 5, 8) + + +def test_mine_data_compare_mode_skips_none_prs(mocker, mock_repo): + commit_mock = mocker.Mock() + commit_mock.sha = "dead99" + commit_mock.commit.message = "Fix something (#99)" + + miner = _make_compare_miner(mocker, mock_repo, compare_commits=[commit_mock], + get_pull_side_effect=lambda _: None) + data = miner.mine_data() + + assert data.pull_requests == {} + + +def test_mine_data_compare_mode_no_pr_numbers_in_message(mocker, mock_repo): + commit_mock = mocker.Mock() + commit_mock.sha = "bumpsha" + commit_mock.commit.message = "Bump version to 2.6.4" + + miner = _make_compare_miner(mocker, mock_repo, compare_commits=[commit_mock]) + data = miner.mine_data() + + assert data.pull_requests == {} + assert "bumpsha" in data.compare_commit_shas + + +# --- mine_data timestamp mode (regression) --- + + +def test_mine_data_timestamp_mode_uses_get_commits(mocker, mock_repo): + mocker.patch("release_notes_generator.action_inputs.ActionInputs.is_from_tag_name_defined", return_value=False) + mocker.patch("release_notes_generator.action_inputs.ActionInputs.get_github_repository", return_value="org/repo") + mock_repo.get_issues.return_value = [] + mock_repo.get_pulls.return_value = [] + mock_repo.get_commits.return_value = [] + + github_mock = mocker.Mock(spec=Github) + github_mock.get_repo.return_value = mock_repo + + miner = DataMiner(github_mock, mocker.Mock()) + miner._safe_call = decorator_mock + mocker.patch.object(miner, "get_latest_release", return_value=None) + + miner.mine_data() + + mock_repo.get_commits.assert_called_once() + mock_repo.compare.assert_not_called() + + +def test_mine_data_timestamp_mode_compare_shas_empty(mocker, mock_repo): + mocker.patch("release_notes_generator.action_inputs.ActionInputs.is_from_tag_name_defined", return_value=False) + mocker.patch("release_notes_generator.action_inputs.ActionInputs.get_github_repository", return_value="org/repo") + mock_repo.get_issues.return_value = [] + mock_repo.get_pulls.return_value = [] + mock_repo.get_commits.return_value = [] + + github_mock = mocker.Mock(spec=Github) + github_mock.get_repo.return_value = mock_repo + + miner = DataMiner(github_mock, mocker.Mock()) + miner._safe_call = decorator_mock + mocker.patch.object(miner, "get_latest_release", return_value=None) + + data = miner.mine_data() + + assert data.compare_commit_shas == set() From 965b720f0947a744bbdf8df4c595ac775d186c52 Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Mon, 1 Jun 2026 11:37:38 +0200 Subject: [PATCH 2/6] Fixed black. --- release_notes_generator/data/filter.py | 6 ++++-- release_notes_generator/data/miner.py | 12 ++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/release_notes_generator/data/filter.py b/release_notes_generator/data/filter.py index 52852e5d..40cc9450 100644 --- a/release_notes_generator/data/filter.py +++ b/release_notes_generator/data/filter.py @@ -95,7 +95,8 @@ def filter(self, data: MinedData) -> MinedData: pulls_dict[pull] = repo logger.debug( "Count of pulls reduced from %d to %d", - len(data.pull_requests.items()), len(pulls_dict.items()), + len(data.pull_requests.items()), + len(pulls_dict.items()), ) commits_dict = { @@ -105,7 +106,8 @@ def filter(self, data: MinedData) -> MinedData: } logger.debug( "Count of commits reduced from %d to %d", - len(data.commits.items()), len(commits_dict.items()), + len(data.commits.items()), + len(commits_dict.items()), ) md.issues = issues_dict diff --git a/release_notes_generator/data/miner.py b/release_notes_generator/data/miner.py index e086cd19..0eb77597 100644 --- a/release_notes_generator/data/miner.py +++ b/release_notes_generator/data/miner.py @@ -71,11 +71,12 @@ def mine_data(self) -> MinedData: self._get_issues(data) if ActionInputs.is_from_tag_name_defined(): - logger.info("Compare mode: using repo.compare('%s', '%s').", - ActionInputs.get_from_tag_name(), ActionInputs.get_tag_name()) - comparison = self._safe_call(repo.compare)( - ActionInputs.get_from_tag_name(), ActionInputs.get_tag_name() + logger.info( + "Compare mode: using repo.compare('%s', '%s').", + ActionInputs.get_from_tag_name(), + ActionInputs.get_tag_name(), ) + comparison = self._safe_call(repo.compare)(ActionInputs.get_from_tag_name(), ActionInputs.get_tag_name()) compare_commits: list[GithubCommit] = list(comparison.commits) data.compare_commit_shas = {c.sha for c in compare_commits} data.commits = {c: data.home_repository for c in compare_commits} @@ -86,8 +87,7 @@ def mine_data(self) -> MinedData: if pr is not None: pulls[pr] = data.home_repository data.pull_requests = pulls - logger.info("Compare mode: found %d commit(s), %d PR(s).", - len(compare_commits), len(data.pull_requests)) + logger.info("Compare mode: found %d commit(s), %d PR(s).", len(compare_commits), len(data.pull_requests)) else: # pulls and commits, then reduce them by the latest release since time pull_requests = list( From 620c78659e098581f921b483310a590420e595b6 Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Mon, 1 Jun 2026 12:02:02 +0200 Subject: [PATCH 3/6] feat: enhance compare mode with total commits handling and warnings --- docs/features/compare_mode.md | 2 +- release_notes_generator/data/miner.py | 30 ++++++++++-- .../data/test_miner.py | 49 ++++++++++++++++++- 3 files changed, 75 insertions(+), 6 deletions(-) diff --git a/docs/features/compare_mode.md b/docs/features/compare_mode.md index 9f724c9c..e14f4264 100644 --- a/docs/features/compare_mode.md +++ b/docs/features/compare_mode.md @@ -13,7 +13,7 @@ moment two release streams run in parallel. **Concrete example — two active streams:** -``` +```text develop: * (tag: v2.7.1) 2026-05-20 Improve Kafka consumer throughput (#1401) * (tag: v2.7.0) 2026-05-14 Fix new service access role (#1363) diff --git a/release_notes_generator/data/miner.py b/release_notes_generator/data/miner.py index 0eb77597..ac604dcb 100644 --- a/release_notes_generator/data/miner.py +++ b/release_notes_generator/data/miner.py @@ -31,12 +31,11 @@ from github.Issue import Issue from github.PullRequest import PullRequest from github.Repository import Repository +from github.Commit import Commit as GithubCommit from release_notes_generator.action_inputs import ActionInputs from release_notes_generator.data.utils.bulk_sub_issue_collector import BulkSubIssueCollector -from github.Commit import Commit as GithubCommit -_PR_NUMBER_RE = re.compile(r"\(#(\d+)\)|Merge pull request #(\d+)") from release_notes_generator.model.record.issue_record import IssueRecord from release_notes_generator.model.mined_data import MinedData from release_notes_generator.model.record.pull_request_record import PullRequestRecord @@ -44,6 +43,9 @@ from release_notes_generator.utils.github_rate_limiter import GithubRateLimiter from release_notes_generator.utils.record_utils import get_id, parse_issue_id +_PR_NUMBER_RE = re.compile(r"\(#(\d+)\)|Merge pull request #(\d+)") +_COMPARE_COMMITS_MAX_RESULTS = 10_000 + logger = logging.getLogger(__name__) @@ -77,12 +79,32 @@ def mine_data(self) -> MinedData: ActionInputs.get_tag_name(), ) comparison = self._safe_call(repo.compare)(ActionInputs.get_from_tag_name(), ActionInputs.get_tag_name()) + if comparison is None: + logger.error( + "Compare API returned no result for '%s'...'%s'. Ending!", + ActionInputs.get_from_tag_name(), + ActionInputs.get_tag_name(), + ) + sys.exit(1) compare_commits: list[GithubCommit] = list(comparison.commits) + total_commits = getattr(comparison, "total_commits", None) + if isinstance(total_commits, int) and total_commits > len(compare_commits): + logger.warning( + "Compare mode: retrieved %d commit(s) but comparison reports %d total; results may be truncated.", + len(compare_commits), + total_commits, + ) + elif len(compare_commits) >= _COMPARE_COMMITS_MAX_RESULTS: + logger.warning( + "Compare mode: retrieved %d commit(s); comparison ranges over %d commits may be truncated.", + len(compare_commits), + _COMPARE_COMMITS_MAX_RESULTS, + ) data.compare_commit_shas = {c.sha for c in compare_commits} data.commits = {c: data.home_repository for c in compare_commits} pr_numbers = self._extract_pr_numbers_from_commits(compare_commits) pulls: dict[PullRequest, Repository] = {} - for number in pr_numbers: + for number in sorted(pr_numbers): pr = self._safe_call(repo.get_pull)(number) if pr is not None: pulls[pr] = data.home_repository @@ -458,7 +480,7 @@ def _extract_pr_numbers_from_commits(commits: list[GithubCommit]) -> set[int]: """ pr_numbers: set[int] = set() for commit in commits: - message = commit.commit.message + message = commit.commit.message.splitlines()[0] if commit.commit.message else "" for match in _PR_NUMBER_RE.finditer(message): number_str = match.group(1) or match.group(2) pr_numbers.add(int(number_str)) diff --git a/tests/unit/release_notes_generator/data/test_miner.py b/tests/unit/release_notes_generator/data/test_miner.py index 1c926b97..8bec3453 100644 --- a/tests/unit/release_notes_generator/data/test_miner.py +++ b/tests/unit/release_notes_generator/data/test_miner.py @@ -574,7 +574,7 @@ def test_extract_pr_numbers_multiline_message(mocker): def _make_compare_miner(mocker, mock_repo, *, from_tag="v2.6.3", to_tag="v2.6.4", created_at=datetime(2026, 5, 7), published_at=None, prefer_published=False, - compare_commits=None, get_pull_side_effect=None): + compare_commits=None, get_pull_side_effect=None, total_commits=None): """Wire a DataMiner for compare-mode mine_data calls.""" mocker.patch("release_notes_generator.action_inputs.ActionInputs.is_from_tag_name_defined", return_value=True) mocker.patch("release_notes_generator.action_inputs.ActionInputs.get_from_tag_name", return_value=from_tag) @@ -591,6 +591,8 @@ def _make_compare_miner(mocker, mock_repo, *, from_tag="v2.6.3", to_tag="v2.6.4" comparison_mock = mocker.Mock() comparison_mock.commits = compare_commits if compare_commits is not None else [] + if total_commits is not None: + comparison_mock.total_commits = total_commits mock_repo.compare.return_value = comparison_mock if get_pull_side_effect is not None: @@ -697,6 +699,51 @@ def test_mine_data_compare_mode_no_pr_numbers_in_message(mocker, mock_repo): assert "bumpsha" in data.compare_commit_shas +def test_mine_data_compare_mode_warns_on_total_commits_overflow(mocker, mock_repo): + commit_mock = mocker.Mock() + commit_mock.sha = "abc123" + commit_mock.commit.message = "Fix service access role (#1363)" + warning_mock = mocker.patch("release_notes_generator.data.miner.logger.warning") + + miner = _make_compare_miner( + mocker, + mock_repo, + compare_commits=[commit_mock], + total_commits=10_001, + ) + miner.mine_data() + + warning_mock.assert_called_once_with( + "Compare mode: retrieved %d commit(s) but comparison reports %d total; results may be truncated.", + 1, + 10_001, + ) + + +def test_mine_data_compare_mode_warns_on_retrieval_cap_without_total(mocker, mock_repo): + commits = [] + for i in range(10_000): + commit_mock = mocker.Mock() + commit_mock.sha = f"sha{i}" + commit_mock.commit.message = "Commit without PR ref" + commits.append(commit_mock) + + warning_mock = mocker.patch("release_notes_generator.data.miner.logger.warning") + + miner = _make_compare_miner( + mocker, + mock_repo, + compare_commits=commits, + ) + miner.mine_data() + + warning_mock.assert_called_once_with( + "Compare mode: retrieved %d commit(s); comparison ranges over %d commits may be truncated.", + 10_000, + 10_000, + ) + + # --- mine_data timestamp mode (regression) --- From e8c8f085939ac9c5870021bc072725c35b9f0525 Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Mon, 1 Jun 2026 12:06:09 +0200 Subject: [PATCH 4/6] test: update PR number extraction test to handle empty input case --- tests/unit/release_notes_generator/data/test_miner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/release_notes_generator/data/test_miner.py b/tests/unit/release_notes_generator/data/test_miner.py index 8bec3453..97f9e6d8 100644 --- a/tests/unit/release_notes_generator/data/test_miner.py +++ b/tests/unit/release_notes_generator/data/test_miner.py @@ -566,7 +566,7 @@ def test_extract_pr_numbers_empty_input(): def test_extract_pr_numbers_multiline_message(mocker): commit = mocker.Mock() commit.commit.message = "Subject line\n\nFixes behaviour (#77)" - assert DataMiner._extract_pr_numbers_from_commits([commit]) == {77} + assert DataMiner._extract_pr_numbers_from_commits([commit]) == set() # --- mine_data compare mode --- From cf95f48701379e73f043b2753624573b10be6265 Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Mon, 1 Jun 2026 13:25:08 +0200 Subject: [PATCH 5/6] Address review comments. --- release_notes_generator/data/SPEC.md | 257 -------------------------- release_notes_generator/data/miner.py | 7 +- 2 files changed, 5 insertions(+), 259 deletions(-) delete mode 100644 release_notes_generator/data/SPEC.md diff --git a/release_notes_generator/data/SPEC.md b/release_notes_generator/data/SPEC.md deleted file mode 100644 index df243ff8..00000000 --- a/release_notes_generator/data/SPEC.md +++ /dev/null @@ -1,257 +0,0 @@ -# SPEC — Compare-mode data mining - -## Problem - -The current `mine_data()` method derives a `since` timestamp from the previous release and fetches all -commits/PRs/issues in that time window. This breaks for branching release histories (e.g. maintenance -`v2.6.x` alongside a `develop`-based `v2.7.x`) because commits from the other branch fall inside the -same window. - -### Concrete example - -Full commit graph (abbreviated): - -``` -develop branch: -* (tag: v2.7.1) 2026-05-20 Improve Kafka consumer throughput (#1401) -* (tag: v2.7.0) 2026-05-14 Fix new service access role (#1363) -* 2026-05-07 Fix/1346 custom hive table (#1349) -* 2026-05-07 Collect latest OpenAPI changes (#1330) -| maintenance/v2.6.x branch: -| * (tag: v2.6.5) 2026-05-20 Backport: handle empty schema in Hive (#1402) -| * (tag: v2.6.4) 2026-05-14 Fix new service access role (#1363) ← cherry-pick -| * (tag: v2.6.3) 2026-05-07 Fix/1346 custom hive table (#1349) ← cherry-pick -|/ -* (tag: v2.6.0) 2026-04-21 Fixes for update-ca-certificates (#1318) -``` - -#### Case 1 — `v2.6.4` (released same day as `v2.7.0`) - -Generating release notes for `v2.6.4` (previous: `v2.6.3`): - -- **Timestamp mode (current):** `since = published_at(v2.6.3) = 2026-05-07`. - `repo.get_commits(since=…)` returns every commit authored between 2026-05-07 and 2026-05-14 - on **any branch** — including `#1349` and `#1330` from develop. Those belong to `v2.7.0`, not - `v2.6.4`, yet they appear in the release notes output. - -- **Compare mode (new):** `repo.compare("v2.6.3", "v2.6.4")` returns only the commits reachable - from `v2.6.4` but **not** from `v2.6.3` — exactly `#1363`. That is the correct set. - -#### Case 2 — `v2.6.5` (released after `v2.7.0` and `v2.7.1`) - -This case is even more broken with the timestamp approach. - -Generating release notes for `v2.6.5` (previous: `v2.6.4`): - -- **Timestamp mode (current):** `since = published_at(v2.6.4) = 2026-05-14`. - `repo.get_commits(since=…)` returns everything authored between 2026-05-14 and 2026-05-20 on - **any branch** — that includes `#1363` (v2.7.0) and `#1401` (v2.7.1) from develop. Neither of - those belongs in the `v2.6.5` patch release notes, but both appear in the output. As more `v2.7.x` - releases accumulate, the noise grows linearly. - -- **Compare mode (new):** `repo.compare("v2.6.4", "v2.6.5")` returns only `#1402` — the single - backport commit unique to `v2.6.5`. That is the correct set, regardless of how many `v2.7.x` - commits exist in the same timestamp window. - -The equivalent Git command the user already uses manually: -``` -git log --oneline --right-only --cherry-pick v2.6.4...v2.6.5 -``` - ---- - -## Solution - -### When compare mode activates - -Compare mode is used when the `from-tag-name` action input is explicitly provided. When it is absent, -the existing timestamp-based path runs unchanged. - -### PyGitHub API used - -`repo.compare(base: str, head: str) → Comparison` - -- Calls `GET /repos/{owner}/{repo}/compare/{base}...{head}` (three-dot diff — commits in `head` not reachable from `base`). -- Available in PyGitHub ≥ 2.x; confirmed present in the project's locked version (`PyGithub==2.9.1`). -- Returns a `github.Comparison.Comparison` object with: - - `commits: PaginatedList[github.Commit.Commit]` — up to 10,000 commits, fully iterable; same `Commit` type already used throughout the codebase. - - `files: list[File]` — first 300 changed files only (not used here). -- The `Commit` objects have the same shape as those returned by `repo.get_commits()`, so no type conversion is needed downstream. - -`repo.get_pull(number: int) → PullRequest` - -- Calls `GET /repos/{owner}/{repo}/pulls/{pull_number}`. -- Returns a full `github.PullRequest.PullRequest` object — same type as those in `data.pull_requests` today. - -### What changes in `DataMiner.mine_data()` - -**Timestamp mode (unchanged path):** -1. Derive `data.since` from the previous release timestamp. -2. `repo.get_commits(since=data.since)` — all commits authored after that timestamp on the default branch. -3. `repo.get_pulls(state="closed", base=default_branch)` — all closed PRs; `FilterByRelease` trims them by `merged_at >= data.since`. -4. `repo.get_issues(since=data.since)` — all issues updated after that timestamp. - -**Compare mode (new path):** -1. Derive `data.since` from the previous release timestamp (same as before — used by `_get_issues` for - unrelated issue fetching, kept for consistency with the rest of the pipeline). -2. `comparison = repo.compare(from_tag, to_tag)` — returns only commits reachable from `to_tag` but - not from `from_tag`. Same semantics as `git log --right-only --cherry-pick from_tag...to_tag`. -3. Iterate `comparison.commits` to collect the commit objects and their SHAs. -4. Extract PR numbers from the commit messages (helper `_extract_pr_numbers_from_commits`): - - Pattern A (squash-merge): `(#NNN)` anywhere in the message → PR number `NNN`. - - Pattern B (merge-commit): `Merge pull request #NNN from …` → PR number `NNN`. - - Both patterns handled by a single regex `\(#(\d+)\)|Merge pull request #(\d+)`. - - Cherry-picks are handled automatically: the commit message on the maintenance branch is - identical to the one on develop, so the PR number is always present regardless of which SHA - was used for the cherry-pick. -5. For each extracted PR number, call `repo.get_pull(number)` to retrieve the full PR object. - `None` results (deleted/missing PRs) are silently skipped. -6. `data.pull_requests` is populated with only those PRs; `data.issues` is left as `{}`. -7. `data.compare_commit_shas` is set to the set of SHAs from step 3. - -### What changes in `FilterByRelease` - -`FilterByRelease` currently trims commits and PRs by timestamp. In compare mode the commit/PR set is -already exact, so timestamp trimming must be skipped. - -Guard: if `data.compare_commit_shas` is non-empty, copy commits and PRs through unchanged; otherwise -run the existing timestamp filter logic. - -### New field on `MinedData` - -`compare_commit_shas: set[str]` — empty by default; set to the SHA strings of all commits returned by -`repo.compare(…)` in compare mode. Acts as the sentinel that tells `FilterByRelease` which path to take. - -### What does NOT change - -- Issue fetching and filtering logic is untouched. -- `data.since` is still derived from the previous release and propagated everywhere that uses it - (e.g. `custom_chapters.since` for release-note extraction date-gating). -- The semantic-version ordering path for finding the previous release is untouched. -- All downstream record building (factory, chapters, builder) is unaffected. - ---- - -## Files changed - -- `release_notes_generator/model/mined_data.py` — add `compare_commit_shas: set[str]` field -- `release_notes_generator/data/miner.py` — add compare-mode branch in `mine_data()`; add helper `_extract_pr_numbers_from_commits()` -- `release_notes_generator/data/filter.py` — skip timestamp filter for PRs and commits when `compare_commit_shas` is non-empty - ---- - -## Test cases (confirmed green ✅) - -All 22 tests implemented and passing (`tests/unit/release_notes_generator/data/`). - -### Helper: `DataMiner._extract_pr_numbers_from_commits` — `test_miner.py` - -| # | Name | Intent | Commit messages | Expected `set[int]` | -|---|------|--------|-----------------|---------------------| -| 1 | `test_extract_pr_numbers_squash_format` | Squash-merge `(#NNN)` tail | `["Fix service access role (#1363)"]` | `{1363}` | -| 2 | `test_extract_pr_numbers_merge_format` | Merge-commit `Merge pull request #NNN` header | `["Merge pull request #1358 from org/branch"]` | `{1358}` | -| 3 | `test_extract_pr_numbers_multiple_commits` | All PR numbers from multiple commits | 3 commits: `"…(#10)"`, `"Merge pull request #20 from …"`, `"…(#30)"` | `{10, 20, 30}` | -| 4 | `test_extract_pr_numbers_deduplicates` | Same PR referenced in two commits is returned once | `["Fix A (#42)", "Follow-up for #42 (#42)"]` | `{42}` | -| 5 | `test_extract_pr_numbers_no_references` | No `#NNN` patterns → empty set | `["Initial commit", "Fix typo in README"]` | `set()` | -| 6 | `test_extract_pr_numbers_empty_input` | Empty commit list → empty set | `[]` | `set()` | -| 7 | `test_extract_pr_numbers_multiline_message` | PR number in second line of message | `["Subject line\n\nFixes behaviour (#77)"]` | `{77}` | - -### `DataMiner.mine_data()` — compare mode — `test_miner.py` - -| # | Name | Intent | Setup | Expected | -|---|------|--------|-------|----------| -| 8 | `test_mine_data_compare_mode_uses_compare_api` | `repo.compare` is called (not `get_commits`) | `is_from_tag_name_defined=True`, from-tag=`v2.6.3`, to-tag=`v2.6.4`; compare returns 1 commit `#1363` | `repo.compare` called once; `repo.get_commits` NOT called; `data.compare_commit_shas == {sha_of_1363}` | -| 9 | `test_mine_data_compare_mode_fetches_prs_by_number` | PRs fetched individually by number from message | Compare commit message `"Fix (#42)"`; `repo.get_pull(42)` returns mock PR | `data.pull_requests == {mock_pr: repo}` | -| 10 | `test_mine_data_compare_mode_multiple_prs` | Two commits → two distinct PRs fetched | Messages `"A (#10)"` and `"B (#20)"`; both `get_pull` calls return valid PRs | `len(data.pull_requests) == 2` | -| 11 | `test_mine_data_compare_mode_leaves_issues_empty` | `data.issues` is `{}` in compare mode | Any compare setup | `data.issues == {}` | -| 12 | `test_mine_data_compare_mode_sets_data_since` | `data.since` still derived from release `created_at` | Release with `created_at=2026-05-07`; `published-at=false` | `data.since == datetime(2026, 5, 7)` | -| 13 | `test_mine_data_compare_mode_sets_data_since_published_at` | `data.since` uses `published_at` when flag set | Release with `published_at=2026-05-08`; `published-at=true` | `data.since == datetime(2026, 5, 8)` | -| 14 | `test_mine_data_compare_mode_skips_none_prs` | `get_pull` returning `None` silently omitted | Compare commit refs PR #99; `get_pull(99)` returns None | `data.pull_requests == {}` | -| 15 | `test_mine_data_compare_mode_no_pr_numbers_in_message` | Commit with no PR ref produces no PRs | Commit message `"Bump version to 2.6.4"` | `data.pull_requests == {}`; `data.compare_commit_shas` non-empty | - -### `DataMiner.mine_data()` — timestamp mode (regression) — `test_miner.py` - -| # | Name | Intent | Setup | Expected | -|---|------|--------|-------|----------| -| 16 | `test_mine_data_timestamp_mode_uses_get_commits` | Timestamp path calls `get_commits(since=…)` not `compare` | `is_from_tag_name_defined=False`; `data.since` set | `repo.get_commits` called with `since=data.since`; `repo.compare` NOT called | -| 17 | `test_mine_data_timestamp_mode_compare_shas_empty` | `compare_commit_shas` is empty set in timestamp mode | `is_from_tag_name_defined=False` | `data.compare_commit_shas == set()` | - -### `FilterByRelease` — compare mode guard — `test_filter.py` - -| # | Name | Intent | Setup | Expected | -|---|------|--------|-------|----------| -| 18 | `test_filter_compare_mode_passes_prs_through` | PRs kept regardless of `merged_at` when in compare mode | `compare_commit_shas` non-empty; PR `merged_at` is 30 days before `data.since` | PR present in `filtered.pull_requests` | -| 19 | `test_filter_compare_mode_passes_commits_through` | Commits kept regardless of author date when in compare mode | `compare_commit_shas` non-empty; commit author date is 30 days before `data.since` | Commit present in `filtered.commits` | -| 20 | `test_filter_compare_mode_passes_multiple_prs_and_commits` | All PRs and commits pass through unfiltered | `compare_commit_shas` non-empty; 3 PRs and 2 commits all dated before `data.since` | All 3 PRs and 2 commits in filtered result | -| 21 | `test_filter_timestamp_mode_filters_old_prs` | Timestamp mode: PR before `since` is excluded | `compare_commit_shas == set()`; PR `merged_at` before `data.since` | PR absent from `filtered.pull_requests` | -| 22 | `test_filter_timestamp_mode_keeps_recent_prs` | Timestamp mode: PR after `since` is kept | `compare_commit_shas == set()`; PR `merged_at` after `data.since` | PR present in `filtered.pull_requests` (regression guard) | - ---- - -## Proposed documentation updates - -These changes should be applied to the user-facing docs once the feature ships. - -### `docs/features/tag_range.md` - -Replace the **How It Works** section with: - -```markdown -## How It Works - -### Timestamp mode (default when `from-tag-name` is omitted) -- The previous release's timestamp is used as the `since` boundary. -- Commits, PRs, and issues updated after that timestamp are fetched. -- **Limitation:** breaks for branching release histories — commits from other active branches - (e.g. `v2.7.x`) fall inside the same timestamp window and appear in the patch release notes. - -### Compare mode (activated when `from-tag-name` is explicitly provided) -- Uses the GitHub Compare API (`GET /repos/{owner}/{repo}/compare/{from-tag}...{to-tag}`). -- Returns only commits reachable from `tag-name` but **not** from `from-tag-name`, regardless - of timestamp or branch. -- PR numbers are extracted from those commit messages; only the matching PRs are fetched. -- Cherry-picks are handled automatically: the commit message on the maintenance branch preserves - the original PR reference. -- Equivalent: `git log --oneline --right-only --cherry-pick ...` - -**Example** — `v2.6.5` (previous: `v2.6.4`): - -​``` -develop: -* (tag: v2.7.1) 2026-05-20 Improve Kafka consumer throughput (#1401) -* (tag: v2.7.0) 2026-05-14 Fix new service access role (#1363) -maintenance/v2.6.x: -| * (tag: v2.6.5) 2026-05-20 Backport: handle empty schema in Hive (#1402) -| * (tag: v2.6.4) 2026-05-14 Fix new service access role (#1363) ← cherry-pick -|/ -* (tag: v2.6.0) 2026-04-21 … -​``` - -- **Timestamp mode** fetches commits between 2026-05-14 and 2026-05-20 on any branch → captures - `#1363` and `#1401` from develop. Incorrect. -- **Compare mode** (`repo.compare("v2.6.4", "v2.6.5")`) returns only `#1402`. Correct. - -### Other behaviour (both modes) -- Tag normalization: `1.6.0` and `v1.6.0` are both accepted. -- `data.since` is still derived from the previous release in compare mode (used for issue fetching - and release-note extraction date-gating). -- If no prior release exists (first release), all issues/PRs are included. -``` - -Also update the **Configuration** example to show `from-tag-name: "v2.6.4"` and the -**Example Result** to show only `#1402`. - ---- - -### `docs/configuration_reference.md` - -Update the `from-tag-name` row description from: - -> `Explicit previous release tag; if empty semantic latest published release is used.` - -To: - -> `Explicit previous release tag. When provided, activates compare mode (graph-based commit -> selection via GitHub Compare API), which is correct for branching release histories -> (e.g. maintenance v2.6.x alongside v2.7.x). If empty, semantic latest published release is -> used with timestamp-based filtering. See [Tag Range](../docs/features/tag_range.md).` diff --git a/release_notes_generator/data/miner.py b/release_notes_generator/data/miner.py index ac604dcb..5f9aea19 100644 --- a/release_notes_generator/data/miner.py +++ b/release_notes_generator/data/miner.py @@ -473,6 +473,9 @@ def _extract_pr_numbers_from_commits(commits: list[GithubCommit]) -> set[int]: """ Extract unique PR numbers from commit messages. + Note: Only the first line (subject) of each commit message is scanned to avoid matching + references in the commit body. + Parameters: commits: Commit objects whose messages are scanned. Returns: @@ -480,8 +483,8 @@ def _extract_pr_numbers_from_commits(commits: list[GithubCommit]) -> set[int]: """ pr_numbers: set[int] = set() for commit in commits: - message = commit.commit.message.splitlines()[0] if commit.commit.message else "" - for match in _PR_NUMBER_RE.finditer(message): + subject = commit.commit.message.splitlines()[0] if commit.commit.message else "" + for match in _PR_NUMBER_RE.finditer(subject): number_str = match.group(1) or match.group(2) pr_numbers.add(int(number_str)) return pr_numbers From d9a4cced22aa4c0c57b1850ef4cb3e84bac3171c Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Tue, 2 Jun 2026 10:59:18 +0200 Subject: [PATCH 6/6] Fixed review comment. --- tests/unit/release_notes_generator/data/test_miner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/release_notes_generator/data/test_miner.py b/tests/unit/release_notes_generator/data/test_miner.py index 97f9e6d8..2c6a3554 100644 --- a/tests/unit/release_notes_generator/data/test_miner.py +++ b/tests/unit/release_notes_generator/data/test_miner.py @@ -700,6 +700,7 @@ def test_mine_data_compare_mode_no_pr_numbers_in_message(mocker, mock_repo): def test_mine_data_compare_mode_warns_on_total_commits_overflow(mocker, mock_repo): + """Test that a warning is logged when the compare API returns more commits than it can retrieve (over 10,000).""" commit_mock = mocker.Mock() commit_mock.sha = "abc123" commit_mock.commit.message = "Fix service access role (#1363)" @@ -721,6 +722,7 @@ def test_mine_data_compare_mode_warns_on_total_commits_overflow(mocker, mock_rep def test_mine_data_compare_mode_warns_on_retrieval_cap_without_total(mocker, mock_repo): + """Test that a warning is logged when the compare API reaches the retrieval cap without a total commit count.""" commits = [] for i in range(10_000): commit_mock = mocker.Mock()