Refactor the index component and update the MVP insurance accordingly (#90)

Refactor the `kotaemon/pipelines` module into `kotaemon/indices` and create the VectorIndex. Note: `qa` currently lives inside `kotaemon/indices`, since at the moment we only have `qa` in RAG. At the same time, I think `qa` could be an independent module at `kotaemon/qa`. Since this can be changed later, I am going with the first option for now, and we can observe whether it needs to change.
2023-11-30 18:35:07 +07:00
parent 8e3a1d193f
commit e34b1e4c6d
25 changed files with 396 additions and 605 deletions
--- a/knowledgehub/indices/ingests/__init__.py
+++ b/knowledgehub/indices/ingests/__init__.py
@@ -0,0 +1,3 @@
+from .files import DocumentIngestor
+
+__all__ = ["DocumentIngestor"]
--- a/knowledgehub/indices/ingests/files.py
+++ b/knowledgehub/indices/ingests/files.py
@@ -0,0 +1,75 @@
+from pathlib import Path
+
+from llama_index.readers.base import BaseReader
+from theflow import Param
+
+from kotaemon.base import BaseComponent, Document
+from kotaemon.indices.extractors import BaseDocParser
+from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
+from kotaemon.loaders import (
+    AutoReader,
+    DirectoryReader,
+    MathpixPDFReader,
+    OCRReader,
+    PandasExcelReader,
+)
+
+
+class DocumentIngestor(BaseComponent):
+    """Ingest common office document types into Document for indexing
+
+    Document types:
+        - pdf
+        - xlsx
+        - docx
+    """
+
+    pdf_mode: str = "normal"  # "normal", "mathpix", "ocr"
+    doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])
+    text_splitter: BaseSplitter = TokenSplitter.withx(
+        chunk_size=1024,
+        chunk_overlap=256,
+    )
+
+    def _get_reader(self, input_files: list[str | Path]):
+        """Get appropriate readers for the input files based on file extension"""
+        file_extractor: dict[str, AutoReader | BaseReader] = {
+            ".xlsx": PandasExcelReader(),
+        }
+
+        if self.pdf_mode == "normal":
+            file_extractor[".pdf"] = AutoReader("UnstructuredReader")
+        elif self.pdf_mode == "ocr":
+            file_extractor[".pdf"] = OCRReader()
+        else:
+            file_extractor[".pdf"] = MathpixPDFReader()
+
+        main_reader = DirectoryReader(
+            input_files=input_files,
+            file_extractor=file_extractor,
+        )
+
+        return main_reader
+
+    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:
+        """Ingest the file paths into Document
+
+        Args:
+            file_paths: list of file paths or a single file path
+
+        Returns:
+            list of parsed Documents
+        """
+        if not isinstance(file_paths, list):
+            file_paths = [file_paths]
+
+        documents = self._get_reader(input_files=file_paths)()
+        nodes = self.text_splitter(documents)
+        self.log_progress(".num_docs", num_docs=len(nodes))
+
+        # document parsers call
+        if self.doc_parsers:
+            for parser in self.doc_parsers:
+                nodes = parser(nodes)
+
+        return nodes