Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.12
22 changes: 19 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,22 @@ You can follow these steps to generate a PageIndex tree from a PDF document.

### 1. Install dependencies

Install the package (along with its pinned dependencies) from the repository root:

```bash
pip3 install --upgrade .
Comment on lines +152 to +155
```

Or install it directly from GitHub to use it in another project:

```bash
pip3 install --upgrade -r requirements.txt
pip3 install "git+https://github.com/VectifyAI/PageIndex.git"
```

To run the agentic vectorless RAG example, install the optional extras:

```bash
pip3 install --upgrade ".[examples]"
```

### 2. Set your LLM API key
Expand All @@ -164,9 +178,11 @@ OPENAI_API_KEY=your_openai_key_here
### 3. Generate PageIndex structure for your PDF

```bash
python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
pageindex --pdf_path /path/to/your/document.pdf
```

> The `pageindex` command is installed with the package. You can also run the same CLI from the repository root without installing the package, provided its dependencies are available: `python3 run_pageindex.py --pdf_path /path/to/your/document.pdf`.

<details>
<summary>Optional parameters</summary>
<br>
Expand All @@ -189,7 +205,7 @@ You can customize the processing with additional optional arguments:
We also provide markdown support for PageIndex. You can use the `--md_path` flag to generate a tree structure for a markdown file.

```bash
python3 run_pageindex.py --md_path /path/to/your/document.md
pageindex --md_path /path/to/your/document.md
```

> Note: in this mode, we use "#" to determine node headings and their levels. For example, "##" is level 2, "###" is level 3, etc. Make sure your markdown file is formatted correctly. If your Markdown file was converted from a PDF or HTML, we don't recommend using this mode, since most existing conversion tools cannot preserve the original hierarchy. Instead, use our [PageIndex OCR](https://pageindex.ai/blog/ocr), which is designed to preserve it, to convert the PDF to a markdown file and then use this mode.
Expand Down
147 changes: 147 additions & 0 deletions pageindex/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import argparse
import asyncio
import json
import os

from pageindex import page_index_main
from pageindex.page_index_md import md_to_tree
from pageindex.utils import ConfigLoader


def build_parser():
    """Create the argument parser for the PageIndex command-line tool.

    The parser accepts an input document (``--pdf_path`` or ``--md_path``),
    an optional model override, PDF-only size limits, yes/no style string
    switches controlling what is attached to each node, and markdown-only
    thinning/summary thresholds.

    Returns:
        argparse.ArgumentParser: The configured parser. Options that have no
        explicit default resolve to ``None`` when omitted.
    """
    cli = argparse.ArgumentParser(
        description='Process PDF or Markdown document and generate structure'
    )

    # Input documents (main() checks that exactly one of them is given).
    for flag, text in (
        ('--pdf_path', 'Path to the PDF file'),
        ('--md_path', 'Path to the Markdown file'),
    ):
        cli.add_argument(flag, type=str, help=text)

    cli.add_argument('--model', type=str, default=None,
                     help='Model to use (overrides config.yaml)')

    # Integer limits that only apply to PDF input; None means "not set".
    pdf_limits = (
        ('--toc-check-pages',
         'Number of pages to check for table of contents (PDF only)'),
        ('--max-pages-per-node',
         'Maximum number of pages per node (PDF only)'),
        ('--max-tokens-per-node',
         'Maximum number of tokens per node (PDF only)'),
    )
    for flag, text in pdf_limits:
        cli.add_argument(flag, type=int, default=None, help=text)

    # String switches describing what to attach to the generated tree.
    node_switches = (
        ('--if-add-node-id', 'Whether to add node id to the node'),
        ('--if-add-node-summary', 'Whether to add summary to the node'),
        ('--if-add-doc-description', 'Whether to add doc description to the doc'),
        ('--if-add-node-text', 'Whether to add text to the node'),
    )
    for flag, text in node_switches:
        cli.add_argument(flag, type=str, default=None, help=text)

    # Markdown specific arguments
    cli.add_argument('--if-thinning', type=str, default='no',
                     help='Whether to apply tree thinning for markdown (markdown only)')
    markdown_thresholds = (
        ('--thinning-threshold', 5000,
         'Minimum token threshold for thinning (markdown only)'),
        ('--summary-token-threshold', 200,
         'Token threshold for generating summaries (markdown only)'),
    )
    for flag, fallback, text in markdown_thresholds:
        cli.add_argument(flag, type=int, default=fallback, help=text)

    return cli


def _process_pdf(args):
# Validate PDF file
if not args.pdf_path.lower().endswith('.pdf'):
raise ValueError("PDF file must have .pdf extension")
if not os.path.isfile(args.pdf_path):
raise ValueError(f"PDF file not found: {args.pdf_path}")

# Process PDF file
user_opt = {
'model': args.model,
'toc_check_page_num': args.toc_check_pages,
'max_page_num_each_node': args.max_pages_per_node,
'max_token_num_each_node': args.max_tokens_per_node,
'if_add_node_id': args.if_add_node_id,
'if_add_node_summary': args.if_add_node_summary,
'if_add_doc_description': args.if_add_doc_description,
'if_add_node_text': args.if_add_node_text,
}
opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})

# Process the PDF
toc_with_page_number = page_index_main(args.pdf_path, opt)
print('Parsing done, saving to file...')

# Save results
pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{pdf_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2)

print(f'Tree structure saved to: {output_file}')


def _process_markdown(args):
# Validate Markdown file
if not args.md_path.lower().endswith(('.md', '.markdown')):
raise ValueError("Markdown file must have .md or .markdown extension")
if not os.path.isfile(args.md_path):
raise ValueError(f"Markdown file not found: {args.md_path}")

# Process markdown file
print('Processing markdown file...')

# Use ConfigLoader to get consistent defaults (matching PDF behavior)
config_loader = ConfigLoader()

# Create options dict with user args
user_opt = {
'model': args.model,
'if_add_node_summary': args.if_add_node_summary,
'if_add_doc_description': args.if_add_doc_description,
'if_add_node_text': args.if_add_node_text,
'if_add_node_id': args.if_add_node_id
}

# Load config with defaults from config.yaml
opt = config_loader.load(user_opt)

toc_with_page_number = asyncio.run(md_to_tree(
md_path=args.md_path,
if_thinning=args.if_thinning.lower() == 'yes',
min_token_threshold=args.thinning_threshold,
if_add_node_summary=opt.if_add_node_summary,
summary_token_threshold=args.summary_token_threshold,
model=opt.model,
if_add_doc_description=opt.if_add_doc_description,
if_add_node_text=opt.if_add_node_text,
if_add_node_id=opt.if_add_node_id
))

print('Parsing done, saving to file...')

# Save results
md_name = os.path.splitext(os.path.basename(args.md_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{md_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)

print(f'Tree structure saved to: {output_file}')


def main(argv=None):
    """Entry point of the PageIndex command-line tool.

    Parses the command line and dispatches to the PDF or Markdown
    processing routine depending on which input path was supplied.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` makes
            argparse read the process arguments.

    Raises:
        ValueError: If neither or both of ``--pdf_path`` and ``--md_path``
            are given.
    """
    args = build_parser().parse_args(argv)

    # Exactly one input document must be selected (empty strings count as
    # "not given").
    has_pdf = bool(args.pdf_path)
    has_md = bool(args.md_path)
    if has_pdf and has_md:
        raise ValueError("Only one of --pdf_path or --md_path can be specified")
    if not (has_pdf or has_md):
        raise ValueError("Either --pdf_path or --md_path must be specified")

    handler = _process_pdf if has_pdf else _process_markdown
    handler(args)


if __name__ == "__main__":
main()
29 changes: 29 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[project]
Comment thread
trehansalil marked this conversation as resolved.
name = "pageindex"
version = "0.1.0"
Comment thread
trehansalil marked this conversation as resolved.
description = "Tools for indexing and processing pages in PDF and related documents."
readme = "README.md"
license = { text = "Proprietary" }
authors = [{ name = "pageindex developers" }]
requires-python = ">=3.12"
dependencies = [
"litellm>=1.83.7",
"pymupdf>=1.26.4",
"PyPDF2>=3.0.1",
"python-dotenv>=1.2.2",
"pyyaml>=6.0.2",
]
Comment on lines +8 to +15

[project.optional-dependencies]
# Required for examples/agentic_vectorless_rag_demo.py
examples = ["openai-agents"]

[project.scripts]
pageindex = "pageindex.cli:main"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["pageindex"]
6 changes: 0 additions & 6 deletions requirements.txt

This file was deleted.

134 changes: 2 additions & 132 deletions run_pageindex.py
Original file line number Diff line number Diff line change
@@ -1,134 +1,4 @@
import argparse
import os
import json
from pageindex import *
from pageindex.page_index_md import md_to_tree
from pageindex.utils import ConfigLoader
from pageindex.cli import main

if __name__ == "__main__":
# Set up argument parser
parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
parser.add_argument('--md_path', type=str, help='Path to the Markdown file')

parser.add_argument('--model', type=str, default=None, help='Model to use (overrides config.yaml)')

parser.add_argument('--toc-check-pages', type=int, default=None,
help='Number of pages to check for table of contents (PDF only)')
parser.add_argument('--max-pages-per-node', type=int, default=None,
help='Maximum number of pages per node (PDF only)')
parser.add_argument('--max-tokens-per-node', type=int, default=None,
help='Maximum number of tokens per node (PDF only)')

parser.add_argument('--if-add-node-id', type=str, default=None,
help='Whether to add node id to the node')
parser.add_argument('--if-add-node-summary', type=str, default=None,
help='Whether to add summary to the node')
parser.add_argument('--if-add-doc-description', type=str, default=None,
help='Whether to add doc description to the doc')
parser.add_argument('--if-add-node-text', type=str, default=None,
help='Whether to add text to the node')

# Markdown specific arguments
parser.add_argument('--if-thinning', type=str, default='no',
help='Whether to apply tree thinning for markdown (markdown only)')
parser.add_argument('--thinning-threshold', type=int, default=5000,
help='Minimum token threshold for thinning (markdown only)')
parser.add_argument('--summary-token-threshold', type=int, default=200,
help='Token threshold for generating summaries (markdown only)')
args = parser.parse_args()

# Validate that exactly one file type is specified
if not args.pdf_path and not args.md_path:
raise ValueError("Either --pdf_path or --md_path must be specified")
if args.pdf_path and args.md_path:
raise ValueError("Only one of --pdf_path or --md_path can be specified")

if args.pdf_path:
# Validate PDF file
if not args.pdf_path.lower().endswith('.pdf'):
raise ValueError("PDF file must have .pdf extension")
if not os.path.isfile(args.pdf_path):
raise ValueError(f"PDF file not found: {args.pdf_path}")

# Process PDF file
user_opt = {
'model': args.model,
'toc_check_page_num': args.toc_check_pages,
'max_page_num_each_node': args.max_pages_per_node,
'max_token_num_each_node': args.max_tokens_per_node,
'if_add_node_id': args.if_add_node_id,
'if_add_node_summary': args.if_add_node_summary,
'if_add_doc_description': args.if_add_doc_description,
'if_add_node_text': args.if_add_node_text,
}
opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})

# Process the PDF
toc_with_page_number = page_index_main(args.pdf_path, opt)
print('Parsing done, saving to file...')

# Save results
pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{pdf_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2)

print(f'Tree structure saved to: {output_file}')

elif args.md_path:
# Validate Markdown file
if not args.md_path.lower().endswith(('.md', '.markdown')):
raise ValueError("Markdown file must have .md or .markdown extension")
if not os.path.isfile(args.md_path):
raise ValueError(f"Markdown file not found: {args.md_path}")

# Process markdown file
print('Processing markdown file...')

# Process the markdown
import asyncio

# Use ConfigLoader to get consistent defaults (matching PDF behavior)
from pageindex.utils import ConfigLoader
config_loader = ConfigLoader()

# Create options dict with user args
user_opt = {
'model': args.model,
'if_add_node_summary': args.if_add_node_summary,
'if_add_doc_description': args.if_add_doc_description,
'if_add_node_text': args.if_add_node_text,
'if_add_node_id': args.if_add_node_id
}

# Load config with defaults from config.yaml
opt = config_loader.load(user_opt)

toc_with_page_number = asyncio.run(md_to_tree(
md_path=args.md_path,
if_thinning=args.if_thinning.lower() == 'yes',
min_token_threshold=args.thinning_threshold,
if_add_node_summary=opt.if_add_node_summary,
summary_token_threshold=args.summary_token_threshold,
model=opt.model,
if_add_doc_description=opt.if_add_doc_description,
if_add_node_text=opt.if_add_node_text,
if_add_node_id=opt.if_add_node_id
))

print('Parsing done, saving to file...')

# Save results
md_name = os.path.splitext(os.path.basename(args.md_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{md_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)

print(f'Tree structure saved to: {output_file}')
main()
Loading