VectifyAI · ranpariyachetan · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,7 @@ __pycache__
 .env*
 .venv/
 logs/
+
+build/
+dist/
+*.egg-info/
diff --git a/PageIndex.py b/PageIndex.py
@@ -0,0 +1,3 @@
+from pageindex.pageindex_runner import PageIndex, PageIndexConfig
+
+__all__ = ["PageIndex", "PageIndexConfig"]
diff --git a/README.md b/README.md
@@ -141,23 +141,68 @@ You can generate the PageIndex tree structure with this open-source repo, or use
 
 # ⚙️ Package Usage
 
-You can follow these steps to generate a PageIndex tree from a PDF document.
+You can install PageIndex as a Python package and use it either through the class API or via the CLI.
 
-### 1. Install dependencies
+### 1. Install
+
+From source:
+
+```bash
+pip3 install .
+```
+
+For development (editable install):
 
 ```bash
-pip3 install --upgrade -r requirements.txt
+pip3 install -e .
 ```
 
 ### 2. Set your LLM API key
 
-Create a `.env` file in the root directory with your LLM API key, with multi-LLM support via [LiteLLM](https://docs.litellm.ai/docs/providers):
+Create a `.env` file in your working directory containing your LLM API key. Multiple LLM providers are supported via [LiteLLM](https://docs.litellm.ai/docs/providers):
 
 ```bash
 OPENAI_API_KEY=your_openai_key_here
 ```
 
-### 3. Generate PageIndex structure for your PDF
+### 3. Use the class API (recommended for integration)
+
+```python
+from pageindex import PageIndex, PageIndexConfig
+
+config = PageIndexConfig(
+    pdf_path="/path/to/your/document.pdf",
+    model="gpt-4o-2024-11-20",   # optional
+    add_node_summary=True,       # optional
+    add_doc_description=True,    # optional
+    add_node_text=False,         # optional
+    add_node_id=True,            # optional
+    output_dir="./results"       # optional
+)
+
+# Get result in memory
+result = PageIndex(config).run()
+
+# Or run and save as JSON
+output_file = PageIndex(config).run_and_save()
+print(output_file)
+```
+
+Markdown input is also supported:
+
+```python
+from pageindex import PageIndex, PageIndexConfig
+
+config = PageIndexConfig(
+    md_path="/path/to/your/document.md",
+    if_thinning=False,            # optional
+    thinning_threshold=5000,      # optional
+    summary_token_threshold=200,  # optional
+)
+result = PageIndex(config).run()
+```
+
+### 4. Use the CLI
 
 ```bash
 python3 run_pageindex.py --pdf_path /path/to/your/document.pdf

diff --git a/pageindex/__init__.py b/pageindex/__init__.py
@@ -1,4 +1,11 @@
-from .page_index import *
-from .page_index_md import md_to_tree
-from .retrieve import get_document, get_document_structure, get_page_content
-from .client import PageIndexClient
+from .pageindex_runner import PageIndex, PageIndexConfig
+
+try:
+    from .page_index import *
+    from .page_index_md import md_to_tree
+    from .retrieve import get_document, get_document_structure, get_page_content
+    from .client import PageIndexClient
+except ModuleNotFoundError as exc:
+    # Allows importing lightweight APIs (e.g. PageIndex config/validation)
+    # before runtime dependencies are installed.
+    #
+    # Only a missing third-party dependency is tolerated. If the module that
+    # could not be found is one of this package's own submodules, the
+    # installation itself is broken and silently hiding that would leave the
+    # package half-populated with no hint as to why.
+    if (exc.name or "").partition(".")[0] == __name__.partition(".")[0]:
+        raise
diff --git a/pageindex/pageindex_runner.py b/pageindex/pageindex_runner.py
@@ -0,0 +1,140 @@
+import asyncio
+import json
+import os
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+
+@dataclass
+class PageIndexConfig:
+    """Options consumed by ``PageIndex``.
+
+    Exactly one of ``pdf_path`` / ``md_path`` must be set; ``PageIndex``
+    raises ``ValueError`` otherwise. Options left as ``None`` are dropped
+    before the remaining ones are handed to ``ConfigLoader().load``.
+    """
+
+    # Input document. PageIndex rejects configs that set both or neither.
+    pdf_path: Optional[str] = None
+    md_path: Optional[str] = None
+    # Forwarded as the "model" option when set.
+    model: Optional[str] = None
+    # Only read on the PDF path; forwarded as "toc_check_page_num",
+    # "max_page_num_each_node" and "max_token_num_each_node" respectively.
+    toc_check_pages: Optional[int] = None
+    max_pages_per_node: Optional[int] = None
+    max_tokens_per_node: Optional[int] = None
+    # Converted to "yes"/"no" and forwarded as the matching "if_add_*" option.
+    add_node_id: Optional[bool] = None
+    add_node_summary: Optional[bool] = None
+    add_doc_description: Optional[bool] = None
+    add_node_text: Optional[bool] = None
+    # Only read on the Markdown path and passed straight to md_to_tree;
+    # thinning_threshold is passed as its min_token_threshold argument.
+    if_thinning: bool = False
+    thinning_threshold: int = 5000
+    summary_token_threshold: int = 200
+    # Directory in which PageIndex.run_and_save writes
+    # "<input file stem>_structure.json".
+    output_dir: str = "./results"
+
+
+class PageIndex:
+    """Public API for generating structure from PDF or Markdown documents.
+
+    The configuration is validated eagerly in ``__init__`` so that a wrong
+    path or extension is reported before any processing starts.
+    """
+
+    def __init__(self, config: "PageIndexConfig"):
+        """Store ``config`` and resolve which input document to process.
+
+        Raises:
+            ValueError: If neither or both of ``pdf_path`` / ``md_path`` are
+                set, if the file extension does not match the input kind, or
+                if the file does not exist.
+        """
+        self.config = config
+        self._doc_kind: Optional[str] = None
+        self._doc_path: Optional[str] = None
+        self._validate_and_resolve_input()
+
+    def run(self) -> Dict[str, Any]:
+        """Process the configured document and return the resulting structure."""
+        if self._is_pdf():
+            return self._process_pdf()
+        if self._is_markdown():
+            return self._process_markdown()
+        # Defensive only: __init__ always sets a document kind.
+        raise ValueError("Input file must be a PDF or Markdown document.")
+
+    def run_and_save(self) -> str:
+        """Run the pipeline and write the result to a JSON file.
+
+        Returns:
+            The path of the written ``<input stem>_structure.json`` file
+            inside ``config.output_dir``.
+        """
+        output_file = self._build_output_file()
+        # Prepare the destination before running, so an unusable output
+        # directory is reported up front instead of after the result has
+        # already been computed (and would then be lost).
+        # An empty output_dir means "current directory"; os.makedirs("")
+        # raises, so the call is skipped in that case.
+        if self.config.output_dir:
+            os.makedirs(self.config.output_dir, exist_ok=True)
+        result = self.run()
+        with open(output_file, "w", encoding="utf-8") as file:
+            json.dump(result, file, indent=2, ensure_ascii=False)
+        return output_file
+
+    @staticmethod
+    def _load_options(user_opt: Dict[str, Any]) -> Any:
+        """Drop unset (``None``) entries and pass the rest to ``ConfigLoader``."""
+        from pageindex.utils import ConfigLoader
+
+        provided = {key: value for key, value in user_opt.items() if value is not None}
+        return ConfigLoader().load(provided)
+
+    def _process_pdf(self) -> Dict[str, Any]:
+        """Build the option set for a PDF input and run ``page_index_main``."""
+        # Imported lazily so that constructing/validating a PageIndex does
+        # not require the processing modules to be importable.
+        from pageindex.page_index import page_index_main
+
+        opt = self._load_options(
+            {
+                "model": self.config.model,
+                "toc_check_page_num": self.config.toc_check_pages,
+                "max_page_num_each_node": self.config.max_pages_per_node,
+                "max_token_num_each_node": self.config.max_tokens_per_node,
+                "if_add_node_id": self._to_yes_no(self.config.add_node_id),
+                "if_add_node_summary": self._to_yes_no(self.config.add_node_summary),
+                "if_add_doc_description": self._to_yes_no(self.config.add_doc_description),
+                "if_add_node_text": self._to_yes_no(self.config.add_node_text),
+            }
+        )
+        return page_index_main(self._doc_path, opt)
+
+    def _process_markdown(self) -> Dict[str, Any]:
+        """Build the option set for a Markdown input and run ``md_to_tree``."""
+        from pageindex.page_index_md import md_to_tree
+
+        opt = self._load_options(
+            {
+                "model": self.config.model,
+                "if_add_node_summary": self._to_yes_no(self.config.add_node_summary),
+                "if_add_doc_description": self._to_yes_no(self.config.add_doc_description),
+                "if_add_node_text": self._to_yes_no(self.config.add_node_text),
+                "if_add_node_id": self._to_yes_no(self.config.add_node_id),
+            }
+        )
+
+        # md_to_tree is a coroutine function; asyncio.run drives it to
+        # completion. NOTE(review): asyncio.run cannot be used while another
+        # event loop is already running in this thread.
+        return asyncio.run(
+            md_to_tree(
+                md_path=self._doc_path,
+                if_thinning=self.config.if_thinning,
+                min_token_threshold=self.config.thinning_threshold,
+                if_add_node_summary=opt.if_add_node_summary,
+                summary_token_threshold=self.config.summary_token_threshold,
+                model=opt.model,
+                if_add_doc_description=opt.if_add_doc_description,
+                if_add_node_text=opt.if_add_node_text,
+                if_add_node_id=opt.if_add_node_id,
+            )
+        )
+
+    def _build_output_file(self) -> str:
+        """Return ``<output_dir>/<input stem>_structure.json``."""
+        base_name = os.path.splitext(os.path.basename(self._doc_path))[0]
+        return os.path.join(self.config.output_dir, f"{base_name}_structure.json")
+
+    def _validate_and_resolve_input(self) -> None:
+        """Check the configured input and record its kind and path.
+
+        Raises:
+            ValueError: If neither or both inputs are set, or if the chosen
+                file fails the extension / existence checks.
+        """
+        pdf_path = self.config.pdf_path
+        md_path = self.config.md_path
+        if not pdf_path and not md_path:
+            raise ValueError("Either --pdf_path or --md_path must be specified")
+        if pdf_path and md_path:
+            raise ValueError("Only one of --pdf_path or --md_path can be specified")
+
+        if pdf_path:
+            # os.fspath lets callers pass pathlib.Path objects as well as
+            # plain strings; everything downstream then sees a str.
+            pdf_path = os.fspath(pdf_path)
+            self._validate_pdf(pdf_path)
+            self._doc_kind = "pdf"
+            self._doc_path = pdf_path
+            return
+
+        md_path = os.fspath(md_path)
+        self._validate_markdown(md_path)
+        self._doc_kind = "markdown"
+        self._doc_path = md_path
+
+    @staticmethod
+    def _validate_pdf(path: str) -> None:
+        """Raise ``ValueError`` unless ``path`` is an existing ``.pdf`` file."""
+        if not path.lower().endswith(".pdf"):
+            raise ValueError("PDF file must have .pdf extension")
+        if not os.path.isfile(path):
+            raise ValueError(f"PDF file not found: {path}")
+
+    @staticmethod
+    def _validate_markdown(path: str) -> None:
+        """Raise ``ValueError`` unless ``path`` is an existing Markdown file."""
+        if not path.lower().endswith((".md", ".markdown")):
+            raise ValueError("Markdown file must have .md or .markdown extension")
+        if not os.path.isfile(path):
+            raise ValueError(f"Markdown file not found: {path}")
+
+    def _is_pdf(self) -> bool:
+        """Return True when the resolved input is a PDF."""
+        return self._doc_kind == "pdf"
+
+    def _is_markdown(self) -> bool:
+        """Return True when the resolved input is a Markdown file."""
+        return self._doc_kind == "markdown"
+
+    @staticmethod
+    def _to_yes_no(value: Optional[bool]) -> Optional[str]:
+        """Map a tri-state flag to ``"yes"`` / ``"no"``, keeping ``None`` as is."""
+        if value is None:
+            return None
+        return "yes" if value else "no"
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,23 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "pageindex"
+version = "0.1.0"
+description = "Vectorless, reasoning-based RAG with hierarchical document indexing"
+readme = "README.md"
+requires-python = ">=3.9"
+dependencies = [
+  "litellm==1.83.0",
+  "pymupdf==1.26.4",
+  "PyPDF2==3.0.1",
+  "python-dotenv==1.1.0",
+  "pyyaml==6.0.2"
+]
+
+[tool.setuptools.packages.find]
+include = ["pageindex*"]
+
+[tool.setuptools.package-data]
+pageindex = ["config.yaml"]
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,7 @@ __pycache__ @@
     .env*
     .venv/
     logs/
+    build/
+    dist/
+    *.egg-info/
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from pageindex.pageindex_runner import PageIndex, PageIndexConfig

		__all__ = ["PageIndex", "PageIndexConfig"]