From 3aa69dcefe029251a520fb25ddb459217fafc7c0 Mon Sep 17 00:00:00 2001
From: Chetan Ranpariya <ranpariyachetan@users.noreply.github.com>
Date: Thu, 9 Apr 2026 23:56:27 +0800
Subject: [PATCH 1/5] Introduced support for configuration and entrypoint
 class.

---
 PageIndex.py | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 PageIndex.py

diff --git a/PageIndex.py b/PageIndex.py
new file mode 100644
index 000000000..9f43130b7
--- /dev/null
+++ b/PageIndex.py
@@ -0,0 +1,140 @@
+import asyncio
+import json
+import os
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+
+@dataclass
+class PageIndexConfig:
+    pdf_path: Optional[str] = None
+    md_path: Optional[str] = None
+    model: Optional[str] = None
+    toc_check_pages: Optional[int] = None
+    max_pages_per_node: Optional[int] = None
+    max_tokens_per_node: Optional[int] = None
+    add_node_id: Optional[bool] = None
+    add_node_summary: Optional[bool] = None
+    add_doc_description: Optional[bool] = None
+    add_node_text: Optional[bool] = None
+    if_thinning: bool = False
+    thinning_threshold: int = 5000
+    summary_token_threshold: int = 200
+    output_dir: str = "./results"
+
+
+class PageIndex:
+    """Public API for generating structure from PDF or Markdown documents."""
+
+    def __init__(self, config: PageIndexConfig):
+        self.config = config
+        self._doc_kind: Optional[str] = None
+        self._doc_path: Optional[str] = None
+        self._validate_and_resolve_input()
+
+    def run(self) -> Dict[str, Any]:
+        if self._is_pdf():
+            return self._process_pdf()
+        if self._is_markdown():
+            return self._process_markdown()
+        raise ValueError("Input file must be a PDF or Markdown document.")
+
+    def run_and_save(self) -> str:
+        result = self.run()
+        output_file = self._build_output_file()
+        os.makedirs(self.config.output_dir, exist_ok=True)
+        with open(output_file, "w", encoding="utf-8") as file:
+            json.dump(result, file, indent=2, ensure_ascii=False)
+        return output_file
+
+    def _process_pdf(self) -> Dict[str, Any]:
+        from pageindex.page_index import page_index_main
+        from pageindex.utils import ConfigLoader
+
+        user_opt = {
+            "model": self.config.model,
+            "toc_check_page_num": self.config.toc_check_pages,
+            "max_page_num_each_node": self.config.max_pages_per_node,
+            "max_token_num_each_node": self.config.max_tokens_per_node,
+            "if_add_node_id": self._to_yes_no(self.config.add_node_id),
+            "if_add_node_summary": self._to_yes_no(self.config.add_node_summary),
+            "if_add_doc_description": self._to_yes_no(self.config.add_doc_description),
+            "if_add_node_text": self._to_yes_no(self.config.add_node_text),
+        }
+        opt = ConfigLoader().load({key: value for key, value in user_opt.items() if value is not None})
+        return page_index_main(self._doc_path, opt)
+
+    def _process_markdown(self) -> Dict[str, Any]:
+        from pageindex.page_index_md import md_to_tree
+        from pageindex.utils import ConfigLoader
+
+        user_opt = {
+            "model": self.config.model,
+            "if_add_node_summary": self._to_yes_no(self.config.add_node_summary),
+            "if_add_doc_description": self._to_yes_no(self.config.add_doc_description),
+            "if_add_node_text": self._to_yes_no(self.config.add_node_text),
+            "if_add_node_id": self._to_yes_no(self.config.add_node_id),
+        }
+        opt = ConfigLoader().load({key: value for key, value in user_opt.items() if value is not None})
+
+        return asyncio.run(
+            md_to_tree(
+                md_path=self._doc_path,
+                if_thinning=self.config.if_thinning,
+                min_token_threshold=self.config.thinning_threshold,
+                if_add_node_summary=opt.if_add_node_summary,
+                summary_token_threshold=self.config.summary_token_threshold,
+                model=opt.model,
+                if_add_doc_description=opt.if_add_doc_description,
+                if_add_node_text=opt.if_add_node_text,
+                if_add_node_id=opt.if_add_node_id,
+            )
+        )
+
+    def _build_output_file(self) -> str:
+        base_name = os.path.splitext(os.path.basename(self._doc_path))[0]
+        return os.path.join(self.config.output_dir, f"{base_name}_structure.json")
+
+    def _validate_and_resolve_input(self) -> None:
+        pdf_path = self.config.pdf_path
+        md_path = self.config.md_path
+        if not pdf_path and not md_path:
+            raise ValueError("Either --pdf_path or --md_path must be specified")
+        if pdf_path and md_path:
+            raise ValueError("Only one of --pdf_path or --md_path can be specified")
+
+        if pdf_path:
+            self._validate_pdf(pdf_path)
+            self._doc_kind = "pdf"
+            self._doc_path = pdf_path
+            return
+
+        self._validate_markdown(md_path)
+        self._doc_kind = "markdown"
+        self._doc_path = md_path
+
+    @staticmethod
+    def _validate_pdf(path: str) -> None:
+        if not path.lower().endswith(".pdf"):
+            raise ValueError("PDF file must have .pdf extension")
+        if not os.path.isfile(path):
+            raise ValueError(f"PDF file not found: {path}")
+
+    @staticmethod
+    def _validate_markdown(path: str) -> None:
+        if not path.lower().endswith((".md", ".markdown")):
+            raise ValueError("Markdown file must have .md or .markdown extension")
+        if not os.path.isfile(path):
+            raise ValueError(f"Markdown file not found: {path}")
+
+    def _is_pdf(self) -> bool:
+        return self._doc_kind == "pdf"
+
+    def _is_markdown(self) -> bool:
+        return self._doc_kind == "markdown"
+
+    @staticmethod
+    def _to_yes_no(value: Optional[bool]) -> Optional[str]:
+        if value is None:
+            return None
+        return "yes" if value else "no"

From 108dc05439707cd5ecc35bf28bb15d434a67b795 Mon Sep 17 00:00:00 2001
From: Chetan Ranpariya <ranpariyachetan@users.noreply.github.com>
Date: Fri, 10 Apr 2026 00:07:47 +0800
Subject: [PATCH 2/5] Added unit tests.

---
 tests/test_pageindex_unit.py       | 143 +++++++++++++++++++++++++++++
 tests/test_pageindex_validation.py |  58 ++++++++++++
 2 files changed, 201 insertions(+)
 create mode 100644 tests/test_pageindex_unit.py
 create mode 100644 tests/test_pageindex_validation.py

diff --git a/tests/test_pageindex_unit.py b/tests/test_pageindex_unit.py
new file mode 100644
index 000000000..1fa3500e8
--- /dev/null
+++ b/tests/test_pageindex_unit.py
@@ -0,0 +1,143 @@
+import os
+import sys
+import tempfile
+import types
+import unittest
+from unittest import mock
+
+from PageIndex import PageIndex, PageIndexConfig
+
+
+class DummyOpt:
+    def __init__(self, data):
+        self.model = data.get("model")
+        self.if_add_node_summary = data.get("if_add_node_summary", "no")
+        self.if_add_doc_description = data.get("if_add_doc_description", "no")
+        self.if_add_node_text = data.get("if_add_node_text", "no")
+        self.if_add_node_id = data.get("if_add_node_id", "yes")
+
+
+class TestPageIndexUnit(unittest.TestCase):
+    def _temp_file(self, suffix: str) -> str:
+        fd, path = tempfile.mkstemp(suffix=suffix)
+        os.close(fd)
+        self.addCleanup(lambda: os.path.exists(path) and os.remove(path))
+        return path
+
+    def _stub_modules(self):
+        calls = {
+            "config_load_input": None,
+            "page_index_main": None,
+            "md_to_tree": None,
+        }
+
+        page_index_module = types.ModuleType("pageindex.page_index")
+
+        def fake_page_index_main(doc, opt):
+            calls["page_index_main"] = {"doc": doc, "opt": opt}
+            return {"doc_name": "stub-pdf", "structure": []}
+
+        page_index_module.page_index_main = fake_page_index_main
+
+        page_index_md_module = types.ModuleType("pageindex.page_index_md")
+
+        async def fake_md_to_tree(**kwargs):
+            calls["md_to_tree"] = kwargs
+            return {"doc_name": "stub-md", "line_count": 1, "structure": []}
+
+        page_index_md_module.md_to_tree = fake_md_to_tree
+
+        utils_module = types.ModuleType("pageindex.utils")
+
+        class FakeConfigLoader:
+            def load(self, data):
+                calls["config_load_input"] = data
+                return DummyOpt(data)
+
+        utils_module.ConfigLoader = FakeConfigLoader
+
+        return calls, {
+            "pageindex.page_index": page_index_module,
+            "pageindex.page_index_md": page_index_md_module,
+            "pageindex.utils": utils_module,
+        }
+
+    def test_pdf_run_dispatches_and_maps_options(self):
+        pdf_path = self._temp_file(".pdf")
+        calls, stubbed = self._stub_modules()
+        config = PageIndexConfig(
+            pdf_path=pdf_path,
+            model="test-model",
+            toc_check_pages=12,
+            max_pages_per_node=7,
+            max_tokens_per_node=1234,
+            add_node_id=True,
+            add_node_summary=False,
+            add_doc_description=True,
+            add_node_text=False,
+        )
+
+        with mock.patch.dict(sys.modules, stubbed):
+            page_index = PageIndex(config)
+            result = page_index.run()
+
+        self.assertEqual(result["doc_name"], "stub-pdf")
+        self.assertEqual(calls["page_index_main"]["doc"], pdf_path)
+        self.assertEqual(calls["config_load_input"]["model"], "test-model")
+        self.assertEqual(calls["config_load_input"]["toc_check_page_num"], 12)
+        self.assertEqual(calls["config_load_input"]["max_page_num_each_node"], 7)
+        self.assertEqual(calls["config_load_input"]["max_token_num_each_node"], 1234)
+        self.assertEqual(calls["config_load_input"]["if_add_node_id"], "yes")
+        self.assertEqual(calls["config_load_input"]["if_add_node_summary"], "no")
+        self.assertEqual(calls["config_load_input"]["if_add_doc_description"], "yes")
+        self.assertEqual(calls["config_load_input"]["if_add_node_text"], "no")
+
+    def test_markdown_run_dispatches_and_maps_options(self):
+        md_path = self._temp_file(".md")
+        calls, stubbed = self._stub_modules()
+        config = PageIndexConfig(
+            md_path=md_path,
+            model="test-model",
+            if_thinning=True,
+            thinning_threshold=777,
+            summary_token_threshold=333,
+            add_node_id=False,
+            add_node_summary=True,
+            add_doc_description=False,
+            add_node_text=True,
+        )
+
+        with mock.patch.dict(sys.modules, stubbed):
+            page_index = PageIndex(config)
+            result = page_index.run()
+
+        self.assertEqual(result["doc_name"], "stub-md")
+        self.assertEqual(calls["config_load_input"]["model"], "test-model")
+        self.assertEqual(calls["config_load_input"]["if_add_node_id"], "no")
+        self.assertEqual(calls["config_load_input"]["if_add_node_summary"], "yes")
+        self.assertEqual(calls["config_load_input"]["if_add_doc_description"], "no")
+        self.assertEqual(calls["config_load_input"]["if_add_node_text"], "yes")
+        self.assertEqual(calls["md_to_tree"]["md_path"], md_path)
+        self.assertTrue(calls["md_to_tree"]["if_thinning"])
+        self.assertEqual(calls["md_to_tree"]["min_token_threshold"], 777)
+        self.assertEqual(calls["md_to_tree"]["summary_token_threshold"], 333)
+
+    def test_run_and_save_writes_expected_output_file(self):
+        md_path = self._temp_file(".md")
+        output_dir = tempfile.mkdtemp()
+        self.addCleanup(lambda: os.path.isdir(output_dir) and os.rmdir(output_dir))
+        calls, stubbed = self._stub_modules()
+        config = PageIndexConfig(md_path=md_path, output_dir=output_dir)
+
+        with mock.patch.dict(sys.modules, stubbed):
+            page_index = PageIndex(config)
+            output_file = page_index.run_and_save()
+
+        self.addCleanup(lambda: os.path.exists(output_file) and os.remove(output_file))
+        self.assertTrue(os.path.exists(output_file))
+        self.assertTrue(output_file.endswith("_structure.json"))
+        self.assertIn(os.path.basename(output_dir), output_file)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_pageindex_validation.py b/tests/test_pageindex_validation.py
new file mode 100644
index 000000000..7840bb331
--- /dev/null
+++ b/tests/test_pageindex_validation.py
@@ -0,0 +1,58 @@
+import os
+import tempfile
+import unittest
+
+from PageIndex import PageIndex, PageIndexConfig
+
+
+class TestPageIndexValidationParity(unittest.TestCase):
+    def _temp_file(self, suffix: str) -> str:
+        fd, path = tempfile.mkstemp(suffix=suffix)
+        os.close(fd)
+        self.addCleanup(lambda: os.path.exists(path) and os.remove(path))
+        return path
+
+    def test_requires_either_pdf_or_markdown_path(self):
+        with self.assertRaisesRegex(ValueError, "Either --pdf_path or --md_path must be specified"):
+            PageIndex(PageIndexConfig())
+
+    def test_rejects_both_pdf_and_markdown_path(self):
+        pdf_path = self._temp_file(".pdf")
+        md_path = self._temp_file(".md")
+        with self.assertRaisesRegex(ValueError, "Only one of --pdf_path or --md_path can be specified"):
+            PageIndex(PageIndexConfig(pdf_path=pdf_path, md_path=md_path))
+
+    def test_pdf_requires_pdf_extension(self):
+        wrong_extension = self._temp_file(".txt")
+        with self.assertRaisesRegex(ValueError, "PDF file must have .pdf extension"):
+            PageIndex(PageIndexConfig(pdf_path=wrong_extension))
+
+    def test_pdf_requires_existing_file(self):
+        # Match a fixed pattern: the temp path may contain regex metacharacters (e.g. "\U" on Windows).
+        with self.assertRaisesRegex(ValueError, r"PDF file not found: .*missing_pageindex_input\.pdf$"):
+            PageIndex(PageIndexConfig(pdf_path=os.path.join(tempfile.gettempdir(), "missing_pageindex_input.pdf")))
+
+    def test_markdown_requires_markdown_extension(self):
+        wrong_extension = self._temp_file(".txt")
+        with self.assertRaisesRegex(ValueError, "Markdown file must have .md or .markdown extension"):
+            PageIndex(PageIndexConfig(md_path=wrong_extension))
+
+    def test_markdown_requires_existing_file(self):
+        # Match a fixed pattern: the temp path may contain regex metacharacters (e.g. "\U" on Windows).
+        with self.assertRaisesRegex(ValueError, r"Markdown file not found: .*missing_pageindex_input\.md$"):
+            PageIndex(PageIndexConfig(md_path=os.path.join(tempfile.gettempdir(), "missing_pageindex_input.md")))
+
+    def test_accepts_valid_pdf_path(self):
+        pdf_path = self._temp_file(".pdf")
+        page_index = PageIndex(PageIndexConfig(pdf_path=pdf_path))
+        self.assertEqual(page_index._doc_kind, "pdf")
+        self.assertEqual(page_index._doc_path, pdf_path)
+
+    def test_accepts_valid_markdown_path(self):
+        md_path = self._temp_file(".md")
+        page_index = PageIndex(PageIndexConfig(md_path=md_path))
+        self.assertEqual(page_index._doc_kind, "markdown")
+        self.assertEqual(page_index._doc_path, md_path)
+
+if __name__ == "__main__":
+    unittest.main()

From 3cb3d7c253d59efc14ecdab5d4447bfde8e43dda Mon Sep 17 00:00:00 2001
From: Chetan Ranpariya <ranpariyachetan@users.noreply.github.com>
Date: Fri, 10 Apr 2026 00:29:20 +0800
Subject: [PATCH 3/5] Updated .gitignore file to ignore build outputs.

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index 23d6b5655..4fc175a2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,7 @@ __pycache__
 .env*
 .venv/
 logs/
+
+build/
+dist/
+*.egg-info/

From cbf33f2319df87c31906703a9d1e04bf88fd58fd Mon Sep 17 00:00:00 2001
From: Chetan Ranpariya <ranpariyachetan@users.noreply.github.com>
Date: Fri, 10 Apr 2026 00:29:47 +0800
Subject: [PATCH 4/5] Rearranged code to make new classes exportable.

---
 PageIndex.py                       | 141 +----------------------------
 pageindex/__init__.py              |  15 ++-
 pageindex/pageindex_runner.py      | 140 ++++++++++++++++++++++++++++
 pyproject.toml                     |  23 +++++
 tests/test_pageindex_unit.py       |   2 +-
 tests/test_pageindex_validation.py |   2 +-
 6 files changed, 178 insertions(+), 145 deletions(-)
 create mode 100644 pageindex/pageindex_runner.py
 create mode 100644 pyproject.toml

diff --git a/PageIndex.py b/PageIndex.py
index 9f43130b7..978093329 100644
--- a/PageIndex.py
+++ b/PageIndex.py
@@ -1,140 +1,3 @@
-import asyncio
-import json
-import os
-from dataclasses import dataclass
-from typing import Any, Dict, Optional
+from pageindex.pageindex_runner import PageIndex, PageIndexConfig
 
-
-@dataclass
-class PageIndexConfig:
-    pdf_path: Optional[str] = None
-    md_path: Optional[str] = None
-    model: Optional[str] = None
-    toc_check_pages: Optional[int] = None
-    max_pages_per_node: Optional[int] = None
-    max_tokens_per_node: Optional[int] = None
-    add_node_id: Optional[bool] = None
-    add_node_summary: Optional[bool] = None
-    add_doc_description: Optional[bool] = None
-    add_node_text: Optional[bool] = None
-    if_thinning: bool = False
-    thinning_threshold: int = 5000
-    summary_token_threshold: int = 200
-    output_dir: str = "./results"
-
-
-class PageIndex:
-    """Public API for generating structure from PDF or Markdown documents."""
-
-    def __init__(self, config: PageIndexConfig):
-        self.config = config
-        self._doc_kind: Optional[str] = None
-        self._doc_path: Optional[str] = None
-        self._validate_and_resolve_input()
-
-    def run(self) -> Dict[str, Any]:
-        if self._is_pdf():
-            return self._process_pdf()
-        if self._is_markdown():
-            return self._process_markdown()
-        raise ValueError("Input file must be a PDF or Markdown document.")
-
-    def run_and_save(self) -> str:
-        result = self.run()
-        output_file = self._build_output_file()
-        os.makedirs(self.config.output_dir, exist_ok=True)
-        with open(output_file, "w", encoding="utf-8") as file:
-            json.dump(result, file, indent=2, ensure_ascii=False)
-        return output_file
-
-    def _process_pdf(self) -> Dict[str, Any]:
-        from pageindex.page_index import page_index_main
-        from pageindex.utils import ConfigLoader
-
-        user_opt = {
-            "model": self.config.model,
-            "toc_check_page_num": self.config.toc_check_pages,
-            "max_page_num_each_node": self.config.max_pages_per_node,
-            "max_token_num_each_node": self.config.max_tokens_per_node,
-            "if_add_node_id": self._to_yes_no(self.config.add_node_id),
-            "if_add_node_summary": self._to_yes_no(self.config.add_node_summary),
-            "if_add_doc_description": self._to_yes_no(self.config.add_doc_description),
-            "if_add_node_text": self._to_yes_no(self.config.add_node_text),
-        }
-        opt = ConfigLoader().load({key: value for key, value in user_opt.items() if value is not None})
-        return page_index_main(self._doc_path, opt)
-
-    def _process_markdown(self) -> Dict[str, Any]:
-        from pageindex.page_index_md import md_to_tree
-        from pageindex.utils import ConfigLoader
-
-        user_opt = {
-            "model": self.config.model,
-            "if_add_node_summary": self._to_yes_no(self.config.add_node_summary),
-            "if_add_doc_description": self._to_yes_no(self.config.add_doc_description),
-            "if_add_node_text": self._to_yes_no(self.config.add_node_text),
-            "if_add_node_id": self._to_yes_no(self.config.add_node_id),
-        }
-        opt = ConfigLoader().load({key: value for key, value in user_opt.items() if value is not None})
-
-        return asyncio.run(
-            md_to_tree(
-                md_path=self._doc_path,
-                if_thinning=self.config.if_thinning,
-                min_token_threshold=self.config.thinning_threshold,
-                if_add_node_summary=opt.if_add_node_summary,
-                summary_token_threshold=self.config.summary_token_threshold,
-                model=opt.model,
-                if_add_doc_description=opt.if_add_doc_description,
-                if_add_node_text=opt.if_add_node_text,
-                if_add_node_id=opt.if_add_node_id,
-            )
-        )
-
-    def _build_output_file(self) -> str:
-        base_name = os.path.splitext(os.path.basename(self._doc_path))[0]
-        return os.path.join(self.config.output_dir, f"{base_name}_structure.json")
-
-    def _validate_and_resolve_input(self) -> None:
-        pdf_path = self.config.pdf_path
-        md_path = self.config.md_path
-        if not pdf_path and not md_path:
-            raise ValueError("Either --pdf_path or --md_path must be specified")
-        if pdf_path and md_path:
-            raise ValueError("Only one of --pdf_path or --md_path can be specified")
-
-        if pdf_path:
-            self._validate_pdf(pdf_path)
-            self._doc_kind = "pdf"
-            self._doc_path = pdf_path
-            return
-
-        self._validate_markdown(md_path)
-        self._doc_kind = "markdown"
-        self._doc_path = md_path
-
-    @staticmethod
-    def _validate_pdf(path: str) -> None:
-        if not path.lower().endswith(".pdf"):
-            raise ValueError("PDF file must have .pdf extension")
-        if not os.path.isfile(path):
-            raise ValueError(f"PDF file not found: {path}")
-
-    @staticmethod
-    def _validate_markdown(path: str) -> None:
-        if not path.lower().endswith((".md", ".markdown")):
-            raise ValueError("Markdown file must have .md or .markdown extension")
-        if not os.path.isfile(path):
-            raise ValueError(f"Markdown file not found: {path}")
-
-    def _is_pdf(self) -> bool:
-        return self._doc_kind == "pdf"
-
-    def _is_markdown(self) -> bool:
-        return self._doc_kind == "markdown"
-
-    @staticmethod
-    def _to_yes_no(value: Optional[bool]) -> Optional[str]:
-        if value is None:
-            return None
-        return "yes" if value else "no"
+__all__ = ["PageIndex", "PageIndexConfig"]
diff --git a/pageindex/__init__.py b/pageindex/__init__.py
index 658003bf5..dff6f8fc1 100644
--- a/pageindex/__init__.py
+++ b/pageindex/__init__.py
@@ -1,4 +1,14 @@
-from .page_index import *
-from .page_index_md import md_to_tree
-from .retrieve import get_document, get_document_structure, get_page_content
-from .client import PageIndexClient
+from .pageindex_runner import PageIndex, PageIndexConfig
+
+try:
+    from .page_index import *
+    from .page_index_md import md_to_tree
+    from .retrieve import get_document, get_document_structure, get_page_content
+    from .client import PageIndexClient
+except ModuleNotFoundError as exc:
+    # Allows importing lightweight APIs (e.g. PageIndex config/validation)
+    # before third-party runtime dependencies are installed. A missing module
+    # that belongs to this package is a packaging error, not an optional
+    # dependency, so it is re-raised instead of being silently ignored.
+    if exc.name and (exc.name + ".").startswith(__name__ + "."):
+        raise
diff --git a/pageindex/pageindex_runner.py b/pageindex/pageindex_runner.py
new file mode 100644
index 000000000..9f43130b7
--- /dev/null
+++ b/pageindex/pageindex_runner.py
@@ -0,0 +1,185 @@
+"""Class-based entry point for generating PageIndex structures."""
+
+import asyncio
+import json
+import os
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+
+@dataclass
+class PageIndexConfig:
+    """Settings for one :class:`PageIndex` run.
+
+    Exactly one of ``pdf_path`` and ``md_path`` must be set. Options left as
+    ``None`` are dropped before the options reach ``ConfigLoader.load``.
+    """
+
+    # Input document; exactly one of the two must be provided.
+    pdf_path: Optional[str] = None
+    md_path: Optional[str] = None
+    # Passed to ``ConfigLoader.load`` when not ``None``.
+    model: Optional[str] = None
+    # Only read when processing a PDF.
+    toc_check_pages: Optional[int] = None
+    max_pages_per_node: Optional[int] = None
+    max_tokens_per_node: Optional[int] = None
+    # Tri-state flags: ``None`` omits the option, True/False are sent as "yes"/"no".
+    add_node_id: Optional[bool] = None
+    add_node_summary: Optional[bool] = None
+    add_doc_description: Optional[bool] = None
+    add_node_text: Optional[bool] = None
+    # Only read when processing Markdown; passed directly to ``md_to_tree``.
+    if_thinning: bool = False
+    thinning_threshold: int = 5000
+    summary_token_threshold: int = 200
+    # Directory that ``PageIndex.run_and_save`` writes into.
+    output_dir: str = "./results"
+
+
+class PageIndex:
+    """Public API for generating structure from PDF or Markdown documents."""
+
+    def __init__(self, config: PageIndexConfig):
+        """Store *config* and validate the input path it names.
+
+        Raises:
+            ValueError: If neither or both input paths are set, the file
+                extension is wrong, or the file does not exist.
+        """
+        self.config = config
+        self._doc_kind: Optional[str] = None
+        self._doc_path: Optional[str] = None
+        self._validate_and_resolve_input()
+
+    def run(self) -> Dict[str, Any]:
+        """Process the configured document and return the result in memory."""
+        if self._is_pdf():
+            return self._process_pdf()
+        if self._is_markdown():
+            return self._process_markdown()
+        raise ValueError("Input file must be a PDF or Markdown document.")
+
+    def run_and_save(self) -> str:
+        """Process the document, write the result as JSON, and return the file path."""
+        # Create the output directory first so that an unusable ``output_dir``
+        # fails before the run rather than after it, when the result would be lost.
+        os.makedirs(self.config.output_dir, exist_ok=True)
+        result = self.run()
+        output_file = self._build_output_file()
+        with open(output_file, "w", encoding="utf-8") as file:
+            json.dump(result, file, indent=2, ensure_ascii=False)
+        return output_file
+
+    @staticmethod
+    def _load_options(user_opt: Dict[str, Any]) -> Any:
+        """Drop ``None`` entries from *user_opt* and load the rest via ``ConfigLoader``."""
+        from pageindex.utils import ConfigLoader
+
+        return ConfigLoader().load({key: value for key, value in user_opt.items() if value is not None})
+
+    def _process_pdf(self) -> Dict[str, Any]:
+        """Build the PDF options from the config and call ``page_index_main``."""
+        from pageindex.page_index import page_index_main
+
+        user_opt = {
+            "model": self.config.model,
+            "toc_check_page_num": self.config.toc_check_pages,
+            "max_page_num_each_node": self.config.max_pages_per_node,
+            "max_token_num_each_node": self.config.max_tokens_per_node,
+            "if_add_node_id": self._to_yes_no(self.config.add_node_id),
+            "if_add_node_summary": self._to_yes_no(self.config.add_node_summary),
+            "if_add_doc_description": self._to_yes_no(self.config.add_doc_description),
+            "if_add_node_text": self._to_yes_no(self.config.add_node_text),
+        }
+        return page_index_main(self._doc_path, self._load_options(user_opt))
+
+    def _process_markdown(self) -> Dict[str, Any]:
+        """Build the Markdown options from the config and run ``md_to_tree``."""
+        from pageindex.page_index_md import md_to_tree
+
+        user_opt = {
+            "model": self.config.model,
+            "if_add_node_summary": self._to_yes_no(self.config.add_node_summary),
+            "if_add_doc_description": self._to_yes_no(self.config.add_doc_description),
+            "if_add_node_text": self._to_yes_no(self.config.add_node_text),
+            "if_add_node_id": self._to_yes_no(self.config.add_node_id),
+        }
+        opt = self._load_options(user_opt)
+
+        # NOTE(review): asyncio.run() raises RuntimeError if an event loop is
+        # already running in this thread (e.g. in a notebook) — confirm whether
+        # such callers need to be supported.
+        return asyncio.run(
+            md_to_tree(
+                md_path=self._doc_path,
+                if_thinning=self.config.if_thinning,
+                min_token_threshold=self.config.thinning_threshold,
+                if_add_node_summary=opt.if_add_node_summary,
+                summary_token_threshold=self.config.summary_token_threshold,
+                model=opt.model,
+                if_add_doc_description=opt.if_add_doc_description,
+                if_add_node_text=opt.if_add_node_text,
+                if_add_node_id=opt.if_add_node_id,
+            )
+        )
+
+    def _build_output_file(self) -> str:
+        """Return ``<output_dir>/<input base name>_structure.json``."""
+        base_name = os.path.splitext(os.path.basename(self._doc_path))[0]
+        return os.path.join(self.config.output_dir, f"{base_name}_structure.json")
+
+    def _validate_and_resolve_input(self) -> None:
+        """Check that exactly one valid input path is set and record its kind.
+
+        Raises:
+            ValueError: If neither or both paths are set, or the chosen path
+                fails its extension or existence check.
+        """
+        pdf_path = self.config.pdf_path
+        md_path = self.config.md_path
+        if not pdf_path and not md_path:
+            raise ValueError("Either --pdf_path or --md_path must be specified")
+        if pdf_path and md_path:
+            raise ValueError("Only one of --pdf_path or --md_path can be specified")
+
+        if pdf_path:
+            self._validate_pdf(pdf_path)
+            self._doc_kind = "pdf"
+            self._doc_path = pdf_path
+            return
+
+        self._validate_markdown(md_path)
+        self._doc_kind = "markdown"
+        self._doc_path = md_path
+
+    @staticmethod
+    def _validate_pdf(path: str) -> None:
+        """Raise ``ValueError`` unless *path* ends in ``.pdf`` and is an existing file."""
+        if not path.lower().endswith(".pdf"):
+            raise ValueError("PDF file must have .pdf extension")
+        if not os.path.isfile(path):
+            raise ValueError(f"PDF file not found: {path}")
+
+    @staticmethod
+    def _validate_markdown(path: str) -> None:
+        """Raise ``ValueError`` unless *path* ends in ``.md``/``.markdown`` and is an existing file."""
+        if not path.lower().endswith((".md", ".markdown")):
+            raise ValueError("Markdown file must have .md or .markdown extension")
+        if not os.path.isfile(path):
+            raise ValueError(f"Markdown file not found: {path}")
+
+    def _is_pdf(self) -> bool:
+        """Return whether the validated input is a PDF."""
+        return self._doc_kind == "pdf"
+
+    def _is_markdown(self) -> bool:
+        """Return whether the validated input is a Markdown file."""
+        return self._doc_kind == "markdown"
+
+    @staticmethod
+    def _to_yes_no(value: Optional[bool]) -> Optional[str]:
+        """Map ``True``/``False`` to ``"yes"``/``"no"`` and pass ``None`` through."""
+        if value is None:
+            return None
+        return "yes" if value else "no"
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000..4a725f820
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,23 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "pageindex"
+version = "0.1.0"
+description = "Vectorless, reasoning-based RAG with hierarchical document indexing"
+readme = "README.md"
+requires-python = ">=3.9"
+dependencies = [
+  "litellm==1.83.0",
+  "pymupdf==1.26.4",
+  "PyPDF2==3.0.1",
+  "python-dotenv==1.1.0",
+  "pyyaml==6.0.2"
+]
+
+[tool.setuptools.packages.find]
+include = ["pageindex*"]
+
+[tool.setuptools.package-data]
+pageindex = ["config.yaml"]
diff --git a/tests/test_pageindex_unit.py b/tests/test_pageindex_unit.py
index 1fa3500e8..b21011de4 100644
--- a/tests/test_pageindex_unit.py
+++ b/tests/test_pageindex_unit.py
@@ -5,7 +5,7 @@
 import unittest
 from unittest import mock
 
-from PageIndex import PageIndex, PageIndexConfig
+from pageindex import PageIndex, PageIndexConfig
 
 
 class DummyOpt:
diff --git a/tests/test_pageindex_validation.py b/tests/test_pageindex_validation.py
index 7840bb331..67044edc0 100644
--- a/tests/test_pageindex_validation.py
+++ b/tests/test_pageindex_validation.py
@@ -2,7 +2,7 @@
 import tempfile
 import unittest
 
-from PageIndex import PageIndex, PageIndexConfig
+from pageindex import PageIndex, PageIndexConfig
 
 
 class TestPageIndexValidationParity(unittest.TestCase):

From 42254481a0b3b79f1f93588ef435ddebf26f9002 Mon Sep 17 00:00:00 2001
From: Chetan Ranpariya <ranpariyachetan@users.noreply.github.com>
Date: Fri, 10 Apr 2026 00:34:23 +0800
Subject: [PATCH 5/5] Updated README.md file with new instructions for
 installing and using it as a module.

---
 README.md | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index a85fbd01d..59d557e2d 100644
--- a/README.md
+++ b/README.md
@@ -141,23 +141,68 @@ You can generate the PageIndex tree structure with this open-source repo, or use
 
 # ⚙️ Package Usage
 
-You can follow these steps to generate a PageIndex tree from a PDF document.
+You can install and use PageIndex either as a Python package (class API) or via the CLI.
 
-### 1. Install dependencies
+### 1. Install
+
+From source:
+
+```bash
+pip3 install .
+```
+
+For development (editable install):
 
 ```bash
-pip3 install --upgrade -r requirements.txt
+pip3 install -e .
 ```
 
 ### 2. Set your LLM API key
 
-Create a `.env` file in the root directory with your LLM API key, with multi-LLM support via [LiteLLM](https://docs.litellm.ai/docs/providers):
+Create a `.env` file in your working directory containing your LLM API key. Multiple LLM providers are supported via [LiteLLM](https://docs.litellm.ai/docs/providers):
 
 ```bash
 OPENAI_API_KEY=your_openai_key_here
 ```
 
-### 3. Generate PageIndex structure for your PDF
+### 3. Use the class API (recommended for integration)
+
+```python
+from pageindex import PageIndex, PageIndexConfig
+
+config = PageIndexConfig(
+    pdf_path="/path/to/your/document.pdf",
+    model="gpt-4o-2024-11-20",   # optional
+    add_node_summary=True,       # optional
+    add_doc_description=True,    # optional
+    add_node_text=False,         # optional
+    add_node_id=True,            # optional
+    output_dir="./results"       # optional
+)
+
+# Get result in memory
+result = PageIndex(config).run()
+
+# Or run and save as JSON
+output_file = PageIndex(config).run_and_save()
+print(output_file)
+```
+
+Markdown input is also supported:
+
+```python
+from pageindex import PageIndex, PageIndexConfig
+
+config = PageIndexConfig(
+    md_path="/path/to/your/document.md",
+    if_thinning=False,            # optional
+    thinning_threshold=5000,      # optional
+    summary_token_threshold=200,  # optional
+)
+result = PageIndex(config).run()
+```
+
+### 4. Use the CLI
 
 ```bash
 python3 run_pageindex.py --pdf_path /path/to/your/document.pdf