diff --git a/pageindex/retrieve.py b/pageindex/retrieve.py
index 55c38509c..6ceb14df8 100644
--- a/pageindex/retrieve.py
+++ b/pageindex/retrieve.py
@@ -56,16 +56,17 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
 def _get_md_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
     """
     For Markdown documents, 'pages' are line numbers.
-    Find nodes whose line_num falls within [min(page_nums), max(page_nums)] and return their text.
+    Return text for nodes whose line_num matches one of the requested numbers.
+    Mirrors the PDF branch: '3,8' returns lines 3 and 8, not lines 3..8.
     """
-    min_line, max_line = min(page_nums), max(page_nums)
+    wanted = set(page_nums)
     results = []
     seen = set()
 
     def _traverse(nodes):
         for node in nodes:
             ln = node.get('line_num')
-            if ln and min_line <= ln <= max_line and ln not in seen:
+            if ln in wanted and ln not in seen:
                 seen.add(ln)
                 results.append({'page': ln, 'content': node.get('text', '')})
             if node.get('nodes'):
diff --git a/tests/test_retrieve_pages.py b/tests/test_retrieve_pages.py
new file mode 100644
index 000000000..b1cd9e170
--- /dev/null
+++ b/tests/test_retrieve_pages.py
@@ -0,0 +1,54 @@
+"""Regression tests for pageindex.retrieve page selection."""
+import json
+
+from pageindex.retrieve import get_page_content
+
+
+def _docs():
+    """Markdown and PDF documents with headings/pages at the same positions."""
+    return {
+        'D_MD': {
+            'type': 'md',
+            'structure': [
+                {'line_num': 5, 'text': 'L5', 'nodes': []},
+                {'line_num': 10, 'text': 'L10', 'nodes': []},
+                {'line_num': 50, 'text': 'L50', 'nodes': []},
+                {'line_num': 100, 'text': 'L100', 'nodes': []},
+            ],
+        },
+        'D_PDF': {
+            'type': 'pdf',
+            'pages': [
+                {'page': 5, 'content': 'P5'},
+                {'page': 10, 'content': 'P10'},
+                {'page': 50, 'content': 'P50'},
+                {'page': 100, 'content': 'P100'},
+            ],
+        },
+    }
+
+
+def _pages(result_json):
+    return sorted(r['page'] for r in json.loads(result_json))
+
+
+def test_md_comma_returns_only_requested_pages():
+    docs = _docs()
+    assert _pages(get_page_content(docs, 'D_MD', '5,100')) == [5, 100]
+
+
+def test_md_and_pdf_agree_on_comma_separated_pages():
+    docs = _docs()
+    md = _pages(get_page_content(docs, 'D_MD', '5,100'))
+    pdf = _pages(get_page_content(docs, 'D_PDF', '5,100'))
+    assert md == pdf == [5, 100]
+
+
+def test_md_range_still_returns_nodes_inside_range():
+    docs = _docs()
+    assert _pages(get_page_content(docs, 'D_MD', '5-50')) == [5, 10, 50]
+
+
+def test_md_single_page_returns_just_that_node():
+    docs = _docs()
+    assert _pages(get_page_content(docs, 'D_MD', '50')) == [50]