kotaemon/knowledgehub/indices/ingests/files.py
Duc Nguyen (john) da0ac1d69f Change template to private attribute and simplify imports (#101)
---------

Co-authored-by: ian <ian@cinnamon.is>
2023-12-08 18:10:34 +07:00

75 lines
2.1 KiB
Python

from pathlib import Path
from llama_index.readers.base import BaseReader
from kotaemon.base import BaseComponent, Document, Param
from kotaemon.indices.extractors import BaseDocParser
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
from kotaemon.loaders import (
AutoReader,
DirectoryReader,
MathpixPDFReader,
OCRReader,
PandasExcelReader,
)
class DocumentIngestor(BaseComponent):
    """Read office-style files and turn them into Document chunks for indexing.

    Pipeline performed by ``run``: load the files through a DirectoryReader,
    split the loaded documents with ``text_splitter``, then pass the resulting
    nodes through every entry of ``doc_parsers`` in order.

    File types with a reader registered by this class:
        - pdf  (reader selected by ``pdf_mode``)
        - xlsx (PandasExcelReader)

    docx is advertised as supported as well, but no reader is registered for
    it here; presumably DirectoryReader's own defaults cover it -- TODO confirm.
    """

    # Selects the pdf reader: "normal" -> AutoReader("UnstructuredReader"),
    # "ocr" -> OCRReader, any other value (e.g. "mathpix") -> MathpixPDFReader.
    pdf_mode: str = "normal"  # "normal", "mathpix", "ocr"
    # Post-split parsers, applied one after another; defaults to an empty list.
    doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])
    # Splitter applied to the loaded documents before any parser runs.
    text_splitter: BaseSplitter = TokenSplitter.withx(
        chunk_size=1024,
        chunk_overlap=256,
    )

    def _get_reader(self, input_files: list[str | Path]):
        """Build the DirectoryReader used to load ``input_files``.

        Args:
            input_files: paths handed to the DirectoryReader unchanged.

        Returns:
            A DirectoryReader whose per-extension readers cover ``.xlsx`` and
            ``.pdf``; the pdf reader depends on ``self.pdf_mode``.
        """
        readers_by_ext: dict[str, AutoReader | BaseReader] = {
            ".xlsx": PandasExcelReader(),
        }

        # Factories rather than instances, so only the selected reader is
        # ever constructed. Modes missing from the table use Mathpix.
        pdf_reader_factories = {
            "normal": lambda: AutoReader("UnstructuredReader"),
            "ocr": OCRReader,
        }
        make_pdf_reader = pdf_reader_factories.get(self.pdf_mode, MathpixPDFReader)
        readers_by_ext[".pdf"] = make_pdf_reader()

        return DirectoryReader(
            input_files=input_files,
            file_extractor=readers_by_ext,
        )

    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:
        """Load, split and parse the given file(s).

        Args:
            file_paths: a list of file paths, or a single file path. Anything
                that is not a ``list`` is treated as one path and wrapped.

        Returns:
            The nodes produced by ``text_splitter`` after they have been
            passed through every parser in ``doc_parsers``.
        """
        paths = file_paths if isinstance(file_paths, list) else [file_paths]

        reader = self._get_reader(input_files=paths)
        loaded = reader()
        nodes = self.text_splitter(loaded)

        # The reported count is taken after splitting, i.e. it counts chunks.
        self.log_progress(".num_docs", num_docs=len(nodes))

        # Each parser receives the output of the previous one.
        for doc_parser in self.doc_parsers or []:
            nodes = doc_parser(nodes)

        return nodes