From 3aa69dcefe029251a520fb25ddb459217fafc7c0 Mon Sep 17 00:00:00 2001 From: Chetan Ranpariya Date: Thu, 9 Apr 2026 23:56:27 +0800 Subject: [PATCH 1/5] Introduced support for configuration and entrypoint class. --- PageIndex.py | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 PageIndex.py diff --git a/PageIndex.py b/PageIndex.py new file mode 100644 index 000000000..9f43130b7 --- /dev/null +++ b/PageIndex.py @@ -0,0 +1,140 @@ +import asyncio +import json +import os +from dataclasses import dataclass +from typing import Any, Dict, Optional + + +@dataclass +class PageIndexConfig: + pdf_path: Optional[str] = None + md_path: Optional[str] = None + model: Optional[str] = None + toc_check_pages: Optional[int] = None + max_pages_per_node: Optional[int] = None + max_tokens_per_node: Optional[int] = None + add_node_id: Optional[bool] = None + add_node_summary: Optional[bool] = None + add_doc_description: Optional[bool] = None + add_node_text: Optional[bool] = None + if_thinning: bool = False + thinning_threshold: int = 5000 + summary_token_threshold: int = 200 + output_dir: str = "./results" + + +class PageIndex: + """Public API for generating structure from PDF or Markdown documents.""" + + def __init__(self, config: PageIndexConfig): + self.config = config + self._doc_kind: Optional[str] = None + self._doc_path: Optional[str] = None + self._validate_and_resolve_input() + + def run(self) -> Dict[str, Any]: + if self._is_pdf(): + return self._process_pdf() + if self._is_markdown(): + return self._process_markdown() + raise ValueError("Input file must be a PDF or Markdown document.") + + def run_and_save(self) -> str: + result = self.run() + output_file = self._build_output_file() + os.makedirs(self.config.output_dir, exist_ok=True) + with open(output_file, "w", encoding="utf-8") as file: + json.dump(result, file, indent=2, ensure_ascii=False) + return output_file + + def _process_pdf(self) -> Dict[str, Any]: + 
from pageindex.page_index import page_index_main + from pageindex.utils import ConfigLoader + + user_opt = { + "model": self.config.model, + "toc_check_page_num": self.config.toc_check_pages, + "max_page_num_each_node": self.config.max_pages_per_node, + "max_token_num_each_node": self.config.max_tokens_per_node, + "if_add_node_id": self._to_yes_no(self.config.add_node_id), + "if_add_node_summary": self._to_yes_no(self.config.add_node_summary), + "if_add_doc_description": self._to_yes_no(self.config.add_doc_description), + "if_add_node_text": self._to_yes_no(self.config.add_node_text), + } + opt = ConfigLoader().load({key: value for key, value in user_opt.items() if value is not None}) + return page_index_main(self._doc_path, opt) + + def _process_markdown(self) -> Dict[str, Any]: + from pageindex.page_index_md import md_to_tree + from pageindex.utils import ConfigLoader + + user_opt = { + "model": self.config.model, + "if_add_node_summary": self._to_yes_no(self.config.add_node_summary), + "if_add_doc_description": self._to_yes_no(self.config.add_doc_description), + "if_add_node_text": self._to_yes_no(self.config.add_node_text), + "if_add_node_id": self._to_yes_no(self.config.add_node_id), + } + opt = ConfigLoader().load({key: value for key, value in user_opt.items() if value is not None}) + + return asyncio.run( + md_to_tree( + md_path=self._doc_path, + if_thinning=self.config.if_thinning, + min_token_threshold=self.config.thinning_threshold, + if_add_node_summary=opt.if_add_node_summary, + summary_token_threshold=self.config.summary_token_threshold, + model=opt.model, + if_add_doc_description=opt.if_add_doc_description, + if_add_node_text=opt.if_add_node_text, + if_add_node_id=opt.if_add_node_id, + ) + ) + + def _build_output_file(self) -> str: + base_name = os.path.splitext(os.path.basename(self._doc_path))[0] + return os.path.join(self.config.output_dir, f"{base_name}_structure.json") + + def _validate_and_resolve_input(self) -> None: + pdf_path = 
self.config.pdf_path + md_path = self.config.md_path + if not pdf_path and not md_path: + raise ValueError("Either --pdf_path or --md_path must be specified") + if pdf_path and md_path: + raise ValueError("Only one of --pdf_path or --md_path can be specified") + + if pdf_path: + self._validate_pdf(pdf_path) + self._doc_kind = "pdf" + self._doc_path = pdf_path + return + + self._validate_markdown(md_path) + self._doc_kind = "markdown" + self._doc_path = md_path + + @staticmethod + def _validate_pdf(path: str) -> None: + if not path.lower().endswith(".pdf"): + raise ValueError("PDF file must have .pdf extension") + if not os.path.isfile(path): + raise ValueError(f"PDF file not found: {path}") + + @staticmethod + def _validate_markdown(path: str) -> None: + if not path.lower().endswith((".md", ".markdown")): + raise ValueError("Markdown file must have .md or .markdown extension") + if not os.path.isfile(path): + raise ValueError(f"Markdown file not found: {path}") + + def _is_pdf(self) -> bool: + return self._doc_kind == "pdf" + + def _is_markdown(self) -> bool: + return self._doc_kind == "markdown" + + @staticmethod + def _to_yes_no(value: Optional[bool]) -> Optional[str]: + if value is None: + return None + return "yes" if value else "no" From 108dc05439707cd5ecc35bf28bb15d434a67b795 Mon Sep 17 00:00:00 2001 From: Chetan Ranpariya Date: Fri, 10 Apr 2026 00:07:47 +0800 Subject: [PATCH 2/5] Added unit tests. 
--- tests/test_pageindex_unit.py | 143 +++++++++++++++++++++++++++++ tests/test_pageindex_validation.py | 58 ++++++++++++ 2 files changed, 201 insertions(+) create mode 100644 tests/test_pageindex_unit.py create mode 100644 tests/test_pageindex_validation.py diff --git a/tests/test_pageindex_unit.py b/tests/test_pageindex_unit.py new file mode 100644 index 000000000..1fa3500e8 --- /dev/null +++ b/tests/test_pageindex_unit.py @@ -0,0 +1,143 @@ +import os +import sys +import tempfile +import types +import unittest +from unittest import mock + +from PageIndex import PageIndex, PageIndexConfig + + +class DummyOpt: + def __init__(self, data): + self.model = data.get("model") + self.if_add_node_summary = data.get("if_add_node_summary", "no") + self.if_add_doc_description = data.get("if_add_doc_description", "no") + self.if_add_node_text = data.get("if_add_node_text", "no") + self.if_add_node_id = data.get("if_add_node_id", "yes") + + +class TestPageIndexUnit(unittest.TestCase): + def _temp_file(self, suffix: str) -> str: + fd, path = tempfile.mkstemp(suffix=suffix) + os.close(fd) + self.addCleanup(lambda: os.path.exists(path) and os.remove(path)) + return path + + def _stub_modules(self): + calls = { + "config_load_input": None, + "page_index_main": None, + "md_to_tree": None, + } + + page_index_module = types.ModuleType("pageindex.page_index") + + def fake_page_index_main(doc, opt): + calls["page_index_main"] = {"doc": doc, "opt": opt} + return {"doc_name": "stub-pdf", "structure": []} + + page_index_module.page_index_main = fake_page_index_main + + page_index_md_module = types.ModuleType("pageindex.page_index_md") + + async def fake_md_to_tree(**kwargs): + calls["md_to_tree"] = kwargs + return {"doc_name": "stub-md", "line_count": 1, "structure": []} + + page_index_md_module.md_to_tree = fake_md_to_tree + + utils_module = types.ModuleType("pageindex.utils") + + class FakeConfigLoader: + def load(self, data): + calls["config_load_input"] = data + return DummyOpt(data) + 
+ utils_module.ConfigLoader = FakeConfigLoader + + return calls, { + "pageindex.page_index": page_index_module, + "pageindex.page_index_md": page_index_md_module, + "pageindex.utils": utils_module, + } + + def test_pdf_run_dispatches_and_maps_options(self): + pdf_path = self._temp_file(".pdf") + calls, stubbed = self._stub_modules() + config = PageIndexConfig( + pdf_path=pdf_path, + model="test-model", + toc_check_pages=12, + max_pages_per_node=7, + max_tokens_per_node=1234, + add_node_id=True, + add_node_summary=False, + add_doc_description=True, + add_node_text=False, + ) + + with mock.patch.dict(sys.modules, stubbed): + page_index = PageIndex(config) + result = page_index.run() + + self.assertEqual(result["doc_name"], "stub-pdf") + self.assertEqual(calls["page_index_main"]["doc"], pdf_path) + self.assertEqual(calls["config_load_input"]["model"], "test-model") + self.assertEqual(calls["config_load_input"]["toc_check_page_num"], 12) + self.assertEqual(calls["config_load_input"]["max_page_num_each_node"], 7) + self.assertEqual(calls["config_load_input"]["max_token_num_each_node"], 1234) + self.assertEqual(calls["config_load_input"]["if_add_node_id"], "yes") + self.assertEqual(calls["config_load_input"]["if_add_node_summary"], "no") + self.assertEqual(calls["config_load_input"]["if_add_doc_description"], "yes") + self.assertEqual(calls["config_load_input"]["if_add_node_text"], "no") + + def test_markdown_run_dispatches_and_maps_options(self): + md_path = self._temp_file(".md") + calls, stubbed = self._stub_modules() + config = PageIndexConfig( + md_path=md_path, + model="test-model", + if_thinning=True, + thinning_threshold=777, + summary_token_threshold=333, + add_node_id=False, + add_node_summary=True, + add_doc_description=False, + add_node_text=True, + ) + + with mock.patch.dict(sys.modules, stubbed): + page_index = PageIndex(config) + result = page_index.run() + + self.assertEqual(result["doc_name"], "stub-md") + 
self.assertEqual(calls["config_load_input"]["model"], "test-model") + self.assertEqual(calls["config_load_input"]["if_add_node_id"], "no") + self.assertEqual(calls["config_load_input"]["if_add_node_summary"], "yes") + self.assertEqual(calls["config_load_input"]["if_add_doc_description"], "no") + self.assertEqual(calls["config_load_input"]["if_add_node_text"], "yes") + self.assertEqual(calls["md_to_tree"]["md_path"], md_path) + self.assertTrue(calls["md_to_tree"]["if_thinning"]) + self.assertEqual(calls["md_to_tree"]["min_token_threshold"], 777) + self.assertEqual(calls["md_to_tree"]["summary_token_threshold"], 333) + + def test_run_and_save_writes_expected_output_file(self): + md_path = self._temp_file(".md") + output_dir = tempfile.mkdtemp() + self.addCleanup(lambda: os.path.isdir(output_dir) and os.rmdir(output_dir)) + calls, stubbed = self._stub_modules() + config = PageIndexConfig(md_path=md_path, output_dir=output_dir) + + with mock.patch.dict(sys.modules, stubbed): + page_index = PageIndex(config) + output_file = page_index.run_and_save() + + self.addCleanup(lambda: os.path.exists(output_file) and os.remove(output_file)) + self.assertTrue(os.path.exists(output_file)) + self.assertTrue(output_file.endswith("_structure.json")) + self.assertIn(os.path.basename(output_dir), output_file) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_pageindex_validation.py b/tests/test_pageindex_validation.py new file mode 100644 index 000000000..7840bb331 --- /dev/null +++ b/tests/test_pageindex_validation.py @@ -0,0 +1,58 @@ +import os +import tempfile +import unittest + +from PageIndex import PageIndex, PageIndexConfig + + +class TestPageIndexValidationParity(unittest.TestCase): + def _temp_file(self, suffix: str) -> str: + fd, path = tempfile.mkstemp(suffix=suffix) + os.close(fd) + self.addCleanup(lambda: os.path.exists(path) and os.remove(path)) + return path + + def test_requires_either_pdf_or_markdown_path(self): + with 
self.assertRaisesRegex(ValueError, "Either --pdf_path or --md_path must be specified"): + PageIndex(PageIndexConfig()) + + def test_rejects_both_pdf_and_markdown_path(self): + pdf_path = self._temp_file(".pdf") + md_path = self._temp_file(".md") + with self.assertRaisesRegex(ValueError, "Only one of --pdf_path or --md_path can be specified"): + PageIndex(PageIndexConfig(pdf_path=pdf_path, md_path=md_path)) + + def test_pdf_requires_pdf_extension(self): + wrong_extension = self._temp_file(".txt") + with self.assertRaisesRegex(ValueError, "PDF file must have .pdf extension"): + PageIndex(PageIndexConfig(pdf_path=wrong_extension)) + + def test_pdf_requires_existing_file(self): + missing_pdf = os.path.join(tempfile.gettempdir(), "missing_pageindex_input.pdf") + with self.assertRaisesRegex(ValueError, f"PDF file not found: {missing_pdf}"): + PageIndex(PageIndexConfig(pdf_path=missing_pdf)) + + def test_markdown_requires_markdown_extension(self): + wrong_extension = self._temp_file(".txt") + with self.assertRaisesRegex(ValueError, "Markdown file must have .md or .markdown extension"): + PageIndex(PageIndexConfig(md_path=wrong_extension)) + + def test_markdown_requires_existing_file(self): + missing_md = os.path.join(tempfile.gettempdir(), "missing_pageindex_input.md") + with self.assertRaisesRegex(ValueError, f"Markdown file not found: {missing_md}"): + PageIndex(PageIndexConfig(md_path=missing_md)) + + def test_accepts_valid_pdf_path(self): + pdf_path = self._temp_file(".pdf") + page_index = PageIndex(PageIndexConfig(pdf_path=pdf_path)) + self.assertEqual(page_index._doc_kind, "pdf") + self.assertEqual(page_index._doc_path, pdf_path) + + def test_accepts_valid_markdown_path(self): + md_path = self._temp_file(".md") + page_index = PageIndex(PageIndexConfig(md_path=md_path)) + self.assertEqual(page_index._doc_kind, "markdown") + self.assertEqual(page_index._doc_path, md_path) + +if __name__ == "__main__": + unittest.main() From 3cb3d7c253d59efc14ecdab5d4447bfde8e43dda Mon 
Sep 17 00:00:00 2001 From: Chetan Ranpariya Date: Fri, 10 Apr 2026 00:29:20 +0800 Subject: [PATCH 3/5] Updated .gitignore file to ignore build outputs. --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 23d6b5655..4fc175a2a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,7 @@ __pycache__ .env* .venv/ logs/ + +build/ +dist/ +*.egg-info/ \ No newline at end of file From cbf33f2319df87c31906703a9d1e04bf88fd58fd Mon Sep 17 00:00:00 2001 From: Chetan Ranpariya Date: Fri, 10 Apr 2026 00:29:47 +0800 Subject: [PATCH 4/5] Rearranged code to make new classes exportable. --- PageIndex.py | 141 +---------------------------- pageindex/__init__.py | 15 ++- pageindex/pageindex_runner.py | 140 ++++++++++++++++++++++++++++ pyproject.toml | 23 +++++ tests/test_pageindex_unit.py | 2 +- tests/test_pageindex_validation.py | 2 +- 6 files changed, 178 insertions(+), 145 deletions(-) create mode 100644 pageindex/pageindex_runner.py create mode 100644 pyproject.toml diff --git a/PageIndex.py b/PageIndex.py index 9f43130b7..978093329 100644 --- a/PageIndex.py +++ b/PageIndex.py @@ -1,140 +1,3 @@ -import asyncio -import json -import os -from dataclasses import dataclass -from typing import Any, Dict, Optional +from pageindex.pageindex_runner import PageIndex, PageIndexConfig - -@dataclass -class PageIndexConfig: - pdf_path: Optional[str] = None - md_path: Optional[str] = None - model: Optional[str] = None - toc_check_pages: Optional[int] = None - max_pages_per_node: Optional[int] = None - max_tokens_per_node: Optional[int] = None - add_node_id: Optional[bool] = None - add_node_summary: Optional[bool] = None - add_doc_description: Optional[bool] = None - add_node_text: Optional[bool] = None - if_thinning: bool = False - thinning_threshold: int = 5000 - summary_token_threshold: int = 200 - output_dir: str = "./results" - - -class PageIndex: - """Public API for generating structure from PDF or Markdown documents.""" - - def __init__(self, 
config: PageIndexConfig): - self.config = config - self._doc_kind: Optional[str] = None - self._doc_path: Optional[str] = None - self._validate_and_resolve_input() - - def run(self) -> Dict[str, Any]: - if self._is_pdf(): - return self._process_pdf() - if self._is_markdown(): - return self._process_markdown() - raise ValueError("Input file must be a PDF or Markdown document.") - - def run_and_save(self) -> str: - result = self.run() - output_file = self._build_output_file() - os.makedirs(self.config.output_dir, exist_ok=True) - with open(output_file, "w", encoding="utf-8") as file: - json.dump(result, file, indent=2, ensure_ascii=False) - return output_file - - def _process_pdf(self) -> Dict[str, Any]: - from pageindex.page_index import page_index_main - from pageindex.utils import ConfigLoader - - user_opt = { - "model": self.config.model, - "toc_check_page_num": self.config.toc_check_pages, - "max_page_num_each_node": self.config.max_pages_per_node, - "max_token_num_each_node": self.config.max_tokens_per_node, - "if_add_node_id": self._to_yes_no(self.config.add_node_id), - "if_add_node_summary": self._to_yes_no(self.config.add_node_summary), - "if_add_doc_description": self._to_yes_no(self.config.add_doc_description), - "if_add_node_text": self._to_yes_no(self.config.add_node_text), - } - opt = ConfigLoader().load({key: value for key, value in user_opt.items() if value is not None}) - return page_index_main(self._doc_path, opt) - - def _process_markdown(self) -> Dict[str, Any]: - from pageindex.page_index_md import md_to_tree - from pageindex.utils import ConfigLoader - - user_opt = { - "model": self.config.model, - "if_add_node_summary": self._to_yes_no(self.config.add_node_summary), - "if_add_doc_description": self._to_yes_no(self.config.add_doc_description), - "if_add_node_text": self._to_yes_no(self.config.add_node_text), - "if_add_node_id": self._to_yes_no(self.config.add_node_id), - } - opt = ConfigLoader().load({key: value for key, value in 
user_opt.items() if value is not None}) - - return asyncio.run( - md_to_tree( - md_path=self._doc_path, - if_thinning=self.config.if_thinning, - min_token_threshold=self.config.thinning_threshold, - if_add_node_summary=opt.if_add_node_summary, - summary_token_threshold=self.config.summary_token_threshold, - model=opt.model, - if_add_doc_description=opt.if_add_doc_description, - if_add_node_text=opt.if_add_node_text, - if_add_node_id=opt.if_add_node_id, - ) - ) - - def _build_output_file(self) -> str: - base_name = os.path.splitext(os.path.basename(self._doc_path))[0] - return os.path.join(self.config.output_dir, f"{base_name}_structure.json") - - def _validate_and_resolve_input(self) -> None: - pdf_path = self.config.pdf_path - md_path = self.config.md_path - if not pdf_path and not md_path: - raise ValueError("Either --pdf_path or --md_path must be specified") - if pdf_path and md_path: - raise ValueError("Only one of --pdf_path or --md_path can be specified") - - if pdf_path: - self._validate_pdf(pdf_path) - self._doc_kind = "pdf" - self._doc_path = pdf_path - return - - self._validate_markdown(md_path) - self._doc_kind = "markdown" - self._doc_path = md_path - - @staticmethod - def _validate_pdf(path: str) -> None: - if not path.lower().endswith(".pdf"): - raise ValueError("PDF file must have .pdf extension") - if not os.path.isfile(path): - raise ValueError(f"PDF file not found: {path}") - - @staticmethod - def _validate_markdown(path: str) -> None: - if not path.lower().endswith((".md", ".markdown")): - raise ValueError("Markdown file must have .md or .markdown extension") - if not os.path.isfile(path): - raise ValueError(f"Markdown file not found: {path}") - - def _is_pdf(self) -> bool: - return self._doc_kind == "pdf" - - def _is_markdown(self) -> bool: - return self._doc_kind == "markdown" - - @staticmethod - def _to_yes_no(value: Optional[bool]) -> Optional[str]: - if value is None: - return None - return "yes" if value else "no" +__all__ = ["PageIndex", 
"PageIndexConfig"] diff --git a/pageindex/__init__.py b/pageindex/__init__.py index 658003bf5..dff6f8fc1 100644 --- a/pageindex/__init__.py +++ b/pageindex/__init__.py @@ -1,4 +1,11 @@ -from .page_index import * -from .page_index_md import md_to_tree -from .retrieve import get_document, get_document_structure, get_page_content -from .client import PageIndexClient +from .pageindex_runner import PageIndex, PageIndexConfig + +try: + from .page_index import * + from .page_index_md import md_to_tree + from .retrieve import get_document, get_document_structure, get_page_content + from .client import PageIndexClient +except ModuleNotFoundError: + # Allows importing lightweight APIs (e.g. PageIndex config/validation) + # before runtime dependencies are installed. + pass diff --git a/pageindex/pageindex_runner.py b/pageindex/pageindex_runner.py new file mode 100644 index 000000000..9f43130b7 --- /dev/null +++ b/pageindex/pageindex_runner.py @@ -0,0 +1,140 @@ +import asyncio +import json +import os +from dataclasses import dataclass +from typing import Any, Dict, Optional + + +@dataclass +class PageIndexConfig: + pdf_path: Optional[str] = None + md_path: Optional[str] = None + model: Optional[str] = None + toc_check_pages: Optional[int] = None + max_pages_per_node: Optional[int] = None + max_tokens_per_node: Optional[int] = None + add_node_id: Optional[bool] = None + add_node_summary: Optional[bool] = None + add_doc_description: Optional[bool] = None + add_node_text: Optional[bool] = None + if_thinning: bool = False + thinning_threshold: int = 5000 + summary_token_threshold: int = 200 + output_dir: str = "./results" + + +class PageIndex: + """Public API for generating structure from PDF or Markdown documents.""" + + def __init__(self, config: PageIndexConfig): + self.config = config + self._doc_kind: Optional[str] = None + self._doc_path: Optional[str] = None + self._validate_and_resolve_input() + + def run(self) -> Dict[str, Any]: + if self._is_pdf(): + return 
self._process_pdf() + if self._is_markdown(): + return self._process_markdown() + raise ValueError("Input file must be a PDF or Markdown document.") + + def run_and_save(self) -> str: + result = self.run() + output_file = self._build_output_file() + os.makedirs(self.config.output_dir, exist_ok=True) + with open(output_file, "w", encoding="utf-8") as file: + json.dump(result, file, indent=2, ensure_ascii=False) + return output_file + + def _process_pdf(self) -> Dict[str, Any]: + from pageindex.page_index import page_index_main + from pageindex.utils import ConfigLoader + + user_opt = { + "model": self.config.model, + "toc_check_page_num": self.config.toc_check_pages, + "max_page_num_each_node": self.config.max_pages_per_node, + "max_token_num_each_node": self.config.max_tokens_per_node, + "if_add_node_id": self._to_yes_no(self.config.add_node_id), + "if_add_node_summary": self._to_yes_no(self.config.add_node_summary), + "if_add_doc_description": self._to_yes_no(self.config.add_doc_description), + "if_add_node_text": self._to_yes_no(self.config.add_node_text), + } + opt = ConfigLoader().load({key: value for key, value in user_opt.items() if value is not None}) + return page_index_main(self._doc_path, opt) + + def _process_markdown(self) -> Dict[str, Any]: + from pageindex.page_index_md import md_to_tree + from pageindex.utils import ConfigLoader + + user_opt = { + "model": self.config.model, + "if_add_node_summary": self._to_yes_no(self.config.add_node_summary), + "if_add_doc_description": self._to_yes_no(self.config.add_doc_description), + "if_add_node_text": self._to_yes_no(self.config.add_node_text), + "if_add_node_id": self._to_yes_no(self.config.add_node_id), + } + opt = ConfigLoader().load({key: value for key, value in user_opt.items() if value is not None}) + + return asyncio.run( + md_to_tree( + md_path=self._doc_path, + if_thinning=self.config.if_thinning, + min_token_threshold=self.config.thinning_threshold, + if_add_node_summary=opt.if_add_node_summary, + 
summary_token_threshold=self.config.summary_token_threshold, + model=opt.model, + if_add_doc_description=opt.if_add_doc_description, + if_add_node_text=opt.if_add_node_text, + if_add_node_id=opt.if_add_node_id, + ) + ) + + def _build_output_file(self) -> str: + base_name = os.path.splitext(os.path.basename(self._doc_path))[0] + return os.path.join(self.config.output_dir, f"{base_name}_structure.json") + + def _validate_and_resolve_input(self) -> None: + pdf_path = self.config.pdf_path + md_path = self.config.md_path + if not pdf_path and not md_path: + raise ValueError("Either --pdf_path or --md_path must be specified") + if pdf_path and md_path: + raise ValueError("Only one of --pdf_path or --md_path can be specified") + + if pdf_path: + self._validate_pdf(pdf_path) + self._doc_kind = "pdf" + self._doc_path = pdf_path + return + + self._validate_markdown(md_path) + self._doc_kind = "markdown" + self._doc_path = md_path + + @staticmethod + def _validate_pdf(path: str) -> None: + if not path.lower().endswith(".pdf"): + raise ValueError("PDF file must have .pdf extension") + if not os.path.isfile(path): + raise ValueError(f"PDF file not found: {path}") + + @staticmethod + def _validate_markdown(path: str) -> None: + if not path.lower().endswith((".md", ".markdown")): + raise ValueError("Markdown file must have .md or .markdown extension") + if not os.path.isfile(path): + raise ValueError(f"Markdown file not found: {path}") + + def _is_pdf(self) -> bool: + return self._doc_kind == "pdf" + + def _is_markdown(self) -> bool: + return self._doc_kind == "markdown" + + @staticmethod + def _to_yes_no(value: Optional[bool]) -> Optional[str]: + if value is None: + return None + return "yes" if value else "no" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..4a725f820 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = 
"pageindex" +version = "0.1.0" +description = "Vectorless, reasoning-based RAG with hierarchical document indexing" +readme = "README.md" +requires-python = ">=3.9" +dependencies = [ + "litellm==1.83.0", + "pymupdf==1.26.4", + "PyPDF2==3.0.1", + "python-dotenv==1.1.0", + "pyyaml==6.0.2" +] + +[tool.setuptools.packages.find] +include = ["pageindex*"] + +[tool.setuptools.package-data] +pageindex = ["config.yaml"] diff --git a/tests/test_pageindex_unit.py b/tests/test_pageindex_unit.py index 1fa3500e8..b21011de4 100644 --- a/tests/test_pageindex_unit.py +++ b/tests/test_pageindex_unit.py @@ -5,7 +5,7 @@ import unittest from unittest import mock -from PageIndex import PageIndex, PageIndexConfig +from pageindex import PageIndex, PageIndexConfig class DummyOpt: diff --git a/tests/test_pageindex_validation.py b/tests/test_pageindex_validation.py index 7840bb331..67044edc0 100644 --- a/tests/test_pageindex_validation.py +++ b/tests/test_pageindex_validation.py @@ -2,7 +2,7 @@ import tempfile import unittest -from PageIndex import PageIndex, PageIndexConfig +from pageindex import PageIndex, PageIndexConfig class TestPageIndexValidationParity(unittest.TestCase): From 42254481a0b3b79f1f93588ef435ddebf26f9002 Mon Sep 17 00:00:00 2001 From: Chetan Ranpariya Date: Fri, 10 Apr 2026 00:34:23 +0800 Subject: [PATCH 5/5] Updated README.md file with new instructions for installing and using it as a module. --- README.md | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a85fbd01d..59d557e2d 100644 --- a/README.md +++ b/README.md @@ -141,23 +141,68 @@ You can generate the PageIndex tree structure with this open-source repo, or use # ⚙️ Package Usage -You can follow these steps to generate a PageIndex tree from a PDF document. +You can install and use PageIndex either as a Python package (class API) or via CLI. -### 1. Install dependencies +### 1.
Install + +From source: + +```bash +pip3 install . +``` + +For development (editable install): ```bash -pip3 install --upgrade -r requirements.txt +pip3 install -e . ``` ### 2. Set your LLM API key -Create a `.env` file in the root directory with your LLM API key, with multi-LLM support via [LiteLLM](https://docs.litellm.ai/docs/providers): +Create a `.env` file in your working directory containing your LLM API key; multiple LLM providers are supported via [LiteLLM](https://docs.litellm.ai/docs/providers): ```bash OPENAI_API_KEY=your_openai_key_here ``` -### 3. Generate PageIndex structure for your PDF +### 3. Use the class API (recommended for integration) + +```python +from pageindex import PageIndex, PageIndexConfig + +config = PageIndexConfig( + pdf_path="/path/to/your/document.pdf", + model="gpt-4o-2024-11-20", # optional + add_node_summary=True, # optional + add_doc_description=True, # optional + add_node_text=False, # optional + add_node_id=True, # optional + output_dir="./results" # optional +) + +# Get result in memory +result = PageIndex(config).run() + +# Or run and save as JSON +output_file = PageIndex(config).run_and_save() +print(output_file) +``` + +Markdown input is also supported: + +```python +from pageindex import PageIndex, PageIndexConfig + +config = PageIndexConfig( + md_path="/path/to/your/document.md", + if_thinning=False, # optional + thinning_threshold=5000, # optional + summary_token_threshold=200, # optional +) +result = PageIndex(config).run() +``` + +### 4. Use the CLI ```bash python3 run_pageindex.py --pdf_path /path/to/your/document.pdf