Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@ __pycache__
.env*
.venv/
logs/

build/
dist/
*.egg-info/
3 changes: 3 additions & 0 deletions PageIndex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from pageindex.pageindex_runner import PageIndex, PageIndexConfig

__all__ = ["PageIndex", "PageIndexConfig"]
55 changes: 50 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,23 +141,68 @@ You can generate the PageIndex tree structure with this open-source repo, or use

# ⚙️ Package Usage

You can follow these steps to generate a PageIndex tree from a PDF document.
You can install and use PageIndex either as a Python package (class API) or via CLI.

### 1. Install dependencies
### 1. Install

From source:

```bash
pip3 install .
```

For development (editable install):

```bash
pip3 install --upgrade -r requirements.txt
pip3 install -e .
```

### 2. Set your LLM API key

Create a `.env` file in the root directory with your LLM API key, with multi-LLM support via [LiteLLM](https://docs.litellm.ai/docs/providers):
Create a `.env` file in your working directory containing your LLM API key. Multiple LLM providers are supported via [LiteLLM](https://docs.litellm.ai/docs/providers):

```bash
OPENAI_API_KEY=your_openai_key_here
```

### 3. Generate PageIndex structure for your PDF
### 3. Use the class API (recommended for integration)

```python
from pageindex import PageIndex, PageIndexConfig

config = PageIndexConfig(
pdf_path="/path/to/your/document.pdf",
model="gpt-4o-2024-11-20", # optional
add_node_summary=True, # optional
add_doc_description=True, # optional
add_node_text=False, # optional
add_node_id=True, # optional
output_dir="./results" # optional
)

# Get result in memory
result = PageIndex(config).run()

# Or run and save as JSON
output_file = PageIndex(config).run_and_save()
print(output_file)
```

Markdown input is also supported:

```python
from pageindex import PageIndex, PageIndexConfig

config = PageIndexConfig(
md_path="/path/to/your/document.md",
if_thinning=False, # optional
thinning_threshold=5000, # optional
summary_token_threshold=200, # optional
)
result = PageIndex(config).run()
```

### 4. Use the CLI

```bash
python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
Expand Down
15 changes: 11 additions & 4 deletions pageindex/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
from .page_index import *
from .page_index_md import md_to_tree
from .retrieve import get_document, get_document_structure, get_page_content
from .client import PageIndexClient
from .pageindex_runner import PageIndex, PageIndexConfig

try:
from .page_index import *
from .page_index_md import md_to_tree
from .retrieve import get_document, get_document_structure, get_page_content
from .client import PageIndexClient
except ModuleNotFoundError:
# Allows importing lightweight APIs (e.g. PageIndex config/validation)
# before runtime dependencies are installed.
pass
140 changes: 140 additions & 0 deletions pageindex/pageindex_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import asyncio
import json
import os
from dataclasses import dataclass
from typing import Any, Dict, Optional


@dataclass
class PageIndexConfig:
    """Settings consumed by ``PageIndex``.

    Exactly one of ``pdf_path`` / ``md_path`` is expected; ``PageIndex``
    rejects a config where both or neither are set. Fields whose default is
    ``None`` are omitted from the options ``PageIndex`` forwards to the
    config loader, so leaving them unset means "do not override".
    """

    # --- input document (set exactly one) ---
    pdf_path: Optional[str] = None
    md_path: Optional[str] = None

    # --- model selection; None means "do not override" ---
    model: Optional[str] = None

    # --- forwarded by the PDF pipeline only ---
    toc_check_pages: Optional[int] = None
    max_pages_per_node: Optional[int] = None
    max_tokens_per_node: Optional[int] = None

    # --- output toggles; translated to "yes"/"no" before being forwarded ---
    add_node_id: Optional[bool] = None
    add_node_summary: Optional[bool] = None
    add_doc_description: Optional[bool] = None
    add_node_text: Optional[bool] = None

    # --- forwarded by the Markdown pipeline only ---
    if_thinning: bool = False
    thinning_threshold: int = 5000
    summary_token_threshold: int = 200

    # --- directory used by ``PageIndex.run_and_save`` ---
    output_dir: str = "./results"


class PageIndex:
    """Public API for generating structure from PDF or Markdown documents.

    The input path is validated when the instance is constructed; ``run``
    then dispatches to the PDF or the Markdown pipeline. The pipeline
    modules (``pageindex.page_index``, ``pageindex.page_index_md``,
    ``pageindex.utils``) are imported lazily inside the processing methods,
    so constructing a ``PageIndex`` only performs input validation.
    """

    def __init__(self, config: "PageIndexConfig"):
        """Store ``config`` and resolve which document it points at.

        Raises:
            ValueError: if neither or both of ``pdf_path`` / ``md_path`` are
                set, if the extension does not match the chosen kind, or if
                the file does not exist.
        """
        self.config = config
        self._doc_kind: Optional[str] = None
        self._doc_path: Optional[str] = None
        self._validate_and_resolve_input()

    def run(self) -> Dict[str, Any]:
        """Run the pipeline matching the resolved document kind.

        Returns:
            Whatever the selected pipeline returns, kept in memory.

        Raises:
            ValueError: if the document kind is neither PDF nor Markdown.
        """
        if self._is_pdf():
            return self._process_pdf()
        if self._is_markdown():
            return self._process_markdown()
        raise ValueError("Input file must be a PDF or Markdown document.")

    def run_and_save(self) -> str:
        """Run the pipeline and write the result as UTF-8 JSON.

        The file is named ``<input stem>_structure.json`` inside
        ``config.output_dir``; the directory is created when missing.

        Returns:
            The path of the written JSON file.
        """
        result = self.run()
        output_file = self._build_output_file()
        # An empty output_dir means "current directory". os.makedirs("")
        # raises FileNotFoundError, so fall back to os.curdir.
        os.makedirs(self.config.output_dir or os.curdir, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as file:
            json.dump(result, file, indent=2, ensure_ascii=False)
        return output_file

    def _load_options(self, user_opt: Dict[str, Any]) -> Any:
        """Drop unset (``None``) entries and pass the rest to ``ConfigLoader``."""
        from pageindex.utils import ConfigLoader

        overrides = {key: value for key, value in user_opt.items() if value is not None}
        return ConfigLoader().load(overrides)

    def _process_pdf(self) -> Dict[str, Any]:
        """Build the option set for a PDF and hand it to ``page_index_main``."""
        from pageindex.page_index import page_index_main

        opt = self._load_options(
            {
                "model": self.config.model,
                "toc_check_page_num": self.config.toc_check_pages,
                "max_page_num_each_node": self.config.max_pages_per_node,
                "max_token_num_each_node": self.config.max_tokens_per_node,
                "if_add_node_id": self._to_yes_no(self.config.add_node_id),
                "if_add_node_summary": self._to_yes_no(self.config.add_node_summary),
                "if_add_doc_description": self._to_yes_no(self.config.add_doc_description),
                "if_add_node_text": self._to_yes_no(self.config.add_node_text),
            }
        )
        return page_index_main(self._doc_path, opt)

    def _process_markdown(self) -> Dict[str, Any]:
        """Build the option set for Markdown and run ``md_to_tree`` to completion.

        ``md_to_tree`` is driven with ``asyncio.run``, so this method cannot
        be called from a thread that already has a running event loop
        (``asyncio.run`` raises ``RuntimeError`` in that case).
        """
        from pageindex.page_index_md import md_to_tree

        opt = self._load_options(
            {
                "model": self.config.model,
                "if_add_node_summary": self._to_yes_no(self.config.add_node_summary),
                "if_add_doc_description": self._to_yes_no(self.config.add_doc_description),
                "if_add_node_text": self._to_yes_no(self.config.add_node_text),
                "if_add_node_id": self._to_yes_no(self.config.add_node_id),
            }
        )

        return asyncio.run(
            md_to_tree(
                md_path=self._doc_path,
                if_thinning=self.config.if_thinning,
                min_token_threshold=self.config.thinning_threshold,
                if_add_node_summary=opt.if_add_node_summary,
                summary_token_threshold=self.config.summary_token_threshold,
                model=opt.model,
                if_add_doc_description=opt.if_add_doc_description,
                if_add_node_text=opt.if_add_node_text,
                if_add_node_id=opt.if_add_node_id,
            )
        )

    def _build_output_file(self) -> str:
        """Return ``<output_dir>/<input stem>_structure.json``."""
        base_name = os.path.splitext(os.path.basename(self._doc_path))[0]
        # "or ''" keeps the result relative when output_dir is empty or unset.
        return os.path.join(self.config.output_dir or "", f"{base_name}_structure.json")

    def _validate_and_resolve_input(self) -> None:
        """Check that exactly one input is configured and record its kind/path.

        Path-like inputs (e.g. ``pathlib.Path``) are converted to ``str``
        once here so every later step works on a plain string.

        Raises:
            ValueError: on missing/duplicate inputs or a failed file check.
        """
        pdf_path = self.config.pdf_path
        md_path = self.config.md_path
        if not pdf_path and not md_path:
            raise ValueError("Either --pdf_path or --md_path must be specified")
        if pdf_path and md_path:
            raise ValueError("Only one of --pdf_path or --md_path can be specified")

        if pdf_path:
            resolved = os.fspath(pdf_path)
            self._validate_pdf(resolved)
            self._doc_kind = "pdf"
            self._doc_path = resolved
            return

        resolved = os.fspath(md_path)
        self._validate_markdown(resolved)
        self._doc_kind = "markdown"
        self._doc_path = resolved

    @staticmethod
    def _validate_pdf(path: str) -> None:
        """Raise ``ValueError`` unless ``path`` ends in ``.pdf`` (any case) and is a file."""
        if not path.lower().endswith(".pdf"):
            raise ValueError("PDF file must have .pdf extension")
        if not os.path.isfile(path):
            raise ValueError(f"PDF file not found: {path}")

    @staticmethod
    def _validate_markdown(path: str) -> None:
        """Raise ``ValueError`` unless ``path`` ends in ``.md``/``.markdown`` and is a file."""
        if not path.lower().endswith((".md", ".markdown")):
            raise ValueError("Markdown file must have .md or .markdown extension")
        if not os.path.isfile(path):
            raise ValueError(f"Markdown file not found: {path}")

    def _is_pdf(self) -> bool:
        """Return True when the resolved input is a PDF."""
        return self._doc_kind == "pdf"

    def _is_markdown(self) -> bool:
        """Return True when the resolved input is a Markdown file."""
        return self._doc_kind == "markdown"

    @staticmethod
    def _to_yes_no(value: Optional[bool]) -> Optional[str]:
        """Map a tri-state flag to ``"yes"`` / ``"no"``, passing ``None`` through."""
        if value is None:
            return None
        return "yes" if value else "no"
23 changes: 23 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "pageindex"
version = "0.1.0"
description = "Vectorless, reasoning-based RAG with hierarchical document indexing"
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
"litellm==1.83.0",
"pymupdf==1.26.4",
"PyPDF2==3.0.1",
"python-dotenv==1.1.0",
"pyyaml==6.0.2"
]

[tool.setuptools.packages.find]
include = ["pageindex*"]

[tool.setuptools.package-data]
pageindex = ["config.yaml"]
Loading