Refactor the index component and update the MVP insurance accordingly (#90)
Refactor the `kotaemon/pipelines` module into `kotaemon/indices` and create the VectorIndex. Note: `qa` currently lives inside `kotaemon/indices`, since `qa` is the only component we have in RAG at the moment. `qa` could also be an independent module at `kotaemon/qa`. Because this can be changed later, I am going with the first option for now and will revisit whether to move it.
This commit is contained in:
committed by
GitHub
parent
8e3a1d193f
commit
e34b1e4c6d
3
knowledgehub/indices/ingests/__init__.py
Normal file
3
knowledgehub/indices/ingests/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .files import DocumentIngestor
|
||||
|
||||
__all__ = ["DocumentIngestor"]
|
75
knowledgehub/indices/ingests/files.py
Normal file
75
knowledgehub/indices/ingests/files.py
Normal file
@@ -0,0 +1,75 @@
|
||||
from pathlib import Path
|
||||
|
||||
from llama_index.readers.base import BaseReader
|
||||
from theflow import Param
|
||||
|
||||
from kotaemon.base import BaseComponent, Document
|
||||
from kotaemon.indices.extractors import BaseDocParser
|
||||
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
|
||||
from kotaemon.loaders import (
|
||||
AutoReader,
|
||||
DirectoryReader,
|
||||
MathpixPDFReader,
|
||||
OCRReader,
|
||||
PandasExcelReader,
|
||||
)
|
||||
|
||||
|
||||
class DocumentIngestor(BaseComponent):
    """Read office documents from disk and turn them into split Documents.

    Document types listed by the original author:
        - pdf
        - xlsx
        - docx

    Only ``.pdf`` and ``.xlsx`` get an explicit reader in this class; any
    other extension is left to whatever ``DirectoryReader`` does by default
    (NOTE(review): presumably that is how docx is covered - confirm).

    Attributes:
        pdf_mode: selects the PDF reader. ``"normal"`` uses
            ``AutoReader("UnstructuredReader")``, ``"ocr"`` uses
            ``OCRReader``, and every other value uses ``MathpixPDFReader``.
        doc_parsers: parsers applied one after another to the split
            documents; defaults to an empty list.
        text_splitter: splitter applied to the loaded documents, configured
            with ``chunk_size=1024`` and ``chunk_overlap=256``.
    """

    pdf_mode: str = "normal"  # "normal", "mathpix", "ocr"
    doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])
    text_splitter: BaseSplitter = TokenSplitter.withx(
        chunk_size=1024,
        chunk_overlap=256,
    )

    def _get_reader(self, input_files: list[str | Path]):
        """Build a DirectoryReader with per-extension readers for the inputs.

        Args:
            input_files: paths handed to the ``DirectoryReader``.

        Returns:
            a ``DirectoryReader`` configured with the ``.xlsx`` reader and
            the ``.pdf`` reader chosen by ``self.pdf_mode``.
        """
        excel_reader = PandasExcelReader()

        # Anything that is neither "normal" nor "ocr" falls through to Mathpix.
        pdf_reader: AutoReader | BaseReader
        if self.pdf_mode == "normal":
            pdf_reader = AutoReader("UnstructuredReader")
        elif self.pdf_mode == "ocr":
            pdf_reader = OCRReader()
        else:
            pdf_reader = MathpixPDFReader()

        readers_by_suffix: dict[str, AutoReader | BaseReader] = {
            ".xlsx": excel_reader,
            ".pdf": pdf_reader,
        }
        return DirectoryReader(
            input_files=input_files,
            file_extractor=readers_by_suffix,
        )

    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:
        """Load, split and parse the given files.

        Args:
            file_paths: list of file paths or a single file path

        Returns:
            list of parsed Documents
        """
        # A lone path (anything that is not a list) is wrapped into a list.
        paths = file_paths if isinstance(file_paths, list) else [file_paths]

        reader = self._get_reader(input_files=paths)
        chunks = self.text_splitter(reader())
        self.log_progress(".num_docs", num_docs=len(chunks))

        # Each parser receives the output of the previous one.
        for doc_parser in self.doc_parsers or []:
            chunks = doc_parser(chunks)

        return chunks
|
Reference in New Issue
Block a user