Refactor the `kotaemon/pipelines` module into `kotaemon/indices` and create the VectorIndex. Note: for now I place `qa` inside `kotaemon/indices`, since `qa` is currently only used for RAG. That said, `qa` could also become an independent `kotaemon/qa` module. Since this is easy to change later, I'm going with the first option for now and we can revisit it if needed.
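
For reviewers, a rough sketch of the import layout this change aims for; where exactly `VectorIndex` is exported from and how `qa` is exposed are assumptions based on the description above, not a confirmed API:

```python
# Assumed layout after the refactor (illustrative only)
from kotaemon.base import BaseComponent, Document      # unchanged
from kotaemon.indices import VectorIndex               # newly created index (assumed export point)
from kotaemon.indices.extractors import BaseDocParser  # now lives under kotaemon/indices
from kotaemon.indices.splitters import TokenSplitter   # now lives under kotaemon/indices
from kotaemon.indices import qa                        # qa kept inside indices for now (option 1)
```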
from pathlib import Path

from llama_index.readers.base import BaseReader
from theflow import Param

from kotaemon.base import BaseComponent, Document
from kotaemon.indices.extractors import BaseDocParser
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
from kotaemon.loaders import (
    AutoReader,
    DirectoryReader,
    MathpixPDFReader,
    OCRReader,
    PandasExcelReader,
)

class DocumentIngestor(BaseComponent):
    """Ingest common office document types into Document for indexing

    Document types:
        - pdf
        - xlsx
        - docx
    """

    pdf_mode: str = "normal"  # "normal", "mathpix", "ocr"
    doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])
    text_splitter: BaseSplitter = TokenSplitter.withx(
        chunk_size=1024,
        chunk_overlap=256,
    )

    def _get_reader(self, input_files: list[str | Path]):
        """Get appropriate readers for the input files based on file extension"""
        file_extractor: dict[str, AutoReader | BaseReader] = {
            ".xlsx": PandasExcelReader(),
        }

        if self.pdf_mode == "normal":
            file_extractor[".pdf"] = AutoReader("UnstructuredReader")
        elif self.pdf_mode == "ocr":
            file_extractor[".pdf"] = OCRReader()
        else:
            file_extractor[".pdf"] = MathpixPDFReader()

        main_reader = DirectoryReader(
            input_files=input_files,
            file_extractor=file_extractor,
        )

        return main_reader

    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:
        """Ingest the file paths into Document

        Args:
            file_paths: list of file paths or a single file path

        Returns:
            list of parsed Documents
        """
        if not isinstance(file_paths, list):
            file_paths = [file_paths]

        documents = self._get_reader(input_files=file_paths)()
        nodes = self.text_splitter(documents)
        self.log_progress(".num_docs", num_docs=len(nodes))

        # apply the document parsers, if any
        if self.doc_parsers:
            for parser in self.doc_parsers:
                nodes = parser(nodes)

        return nodes
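
A minimal usage sketch of the ingestor above. The file names are placeholders, and it assumes the component's params can be passed as constructor keywords (the usual theflow pattern):

```python
# Hypothetical usage sketch; file names are placeholders
ingestor = DocumentIngestor(pdf_mode="ocr")  # use the OCR reader for PDFs
nodes = ingestor.run(["report.pdf", "figures.xlsx"])
# a single path also works: run() wraps it into a list automatically
print(f"ingested {len(nodes)} chunks")
```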