Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions pageindex/retrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,17 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
def _get_md_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
"""
For Markdown documents, 'pages' are line numbers.
Find nodes whose line_num falls within [min(page_nums), max(page_nums)] and return their text.
Return text for nodes whose line_num matches one of the requested numbers.
Mirrors the PDF branch: '3,8' returns lines 3 and 8, not lines 3..8.
"""
min_line, max_line = min(page_nums), max(page_nums)
wanted = set(page_nums)
results = []
seen = set()

def _traverse(nodes):
for node in nodes:
ln = node.get('line_num')
if ln and min_line <= ln <= max_line and ln not in seen:
if ln in wanted and ln not in seen:
seen.add(ln)
results.append({'page': ln, 'content': node.get('text', '')})
if node.get('nodes'):
Expand Down
54 changes: 54 additions & 0 deletions tests/test_retrieve_pages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Regression tests for pageindex.retrieve page selection."""
import json

from pageindex.retrieve import get_page_content


def _docs():
"""Markdown and PDF documents with headings/pages at the same positions."""
return {
'D_MD': {
'type': 'md',
'structure': [
{'line_num': 5, 'text': 'L5', 'nodes': []},
{'line_num': 10, 'text': 'L10', 'nodes': []},
{'line_num': 50, 'text': 'L50', 'nodes': []},
{'line_num': 100, 'text': 'L100', 'nodes': []},
],
},
'D_PDF': {
'type': 'pdf',
'pages': [
{'page': 5, 'content': 'P5'},
{'page': 10, 'content': 'P10'},
{'page': 50, 'content': 'P50'},
{'page': 100, 'content': 'P100'},
],
},
}


def _pages(result_json):
return sorted(r['page'] for r in json.loads(result_json))


def test_md_comma_returns_only_requested_pages():
    """A comma spec on Markdown selects exactly the listed line numbers.

    Regression guard: '5,100' must not be treated as the range 5..100,
    which would also pull in the nodes at lines 10 and 50.
    """
    docs = _docs()
    assert _pages(get_page_content(docs, 'D_MD', '5,100')) == [5, 100]


def test_md_and_pdf_agree_on_comma_separated_pages():
    """Markdown and PDF documents resolve the same comma spec identically.

    The fixture places entries at the same positions in both documents,
    so '5,100' should produce pages [5, 100] for each of them.
    """
    docs = _docs()
    md = _pages(get_page_content(docs, 'D_MD', '5,100'))
    pdf = _pages(get_page_content(docs, 'D_PDF', '5,100'))
    assert md == pdf == [5, 100]


def test_md_range_still_returns_nodes_inside_range():
    """A dash range on Markdown still returns every node inside the range.

    '5-50' covers the fixture nodes at lines 5, 10 and 50 and excludes
    the node at line 100.
    """
    docs = _docs()
    assert _pages(get_page_content(docs, 'D_MD', '5-50')) == [5, 10, 50]


def test_md_single_page_returns_just_that_node():
    """A single number on Markdown returns only the node at that line."""
    docs = _docs()
    assert _pages(get_page_content(docs, 'D_MD', '50')) == [50]