# kotaemon/knowledgehub/pipelines/ingest.py

import os
from pathlib import Path
from typing import Dict, List, Optional, Union

from llama_index.readers.base import BaseReader
from theflow import Node
from theflow.utils.modules import ObjectInitDeclaration as _

from kotaemon.base import BaseComponent
from kotaemon.docstores import InMemoryDocumentStore
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.loaders import (
    AutoReader,
    DirectoryReader,
    MathpixPDFReader,
    PandasExcelReader,
)
from kotaemon.parsers.splitter import SimpleNodeParser
from kotaemon.pipelines.agents import BaseAgent
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
from kotaemon.vectorstores import InMemoryVectorStore

from .qa import AgentQAPipeline, QuestionAnsweringPipeline
from .utils import file_names_to_collection_name


class ReaderIndexingPipeline(BaseComponent):
"""
Indexing pipeline which takes input from list of files
and perform ingestion to vectorstore
"""
    # Expose these variables so users can switch them in the prompt UI
storage_path: Path = Path("./storage")
reader_name: str = "normal" # "normal" or "mathpix"
chunk_size: int = 1024
chunk_overlap: int = 256
file_name_list: List[str] = list()
vector_store: _[InMemoryVectorStore] = _(InMemoryVectorStore)
doc_store: _[InMemoryDocumentStore] = _(InMemoryDocumentStore)
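    # NOTE: the deployment name and endpoint below appear to be placeholders;
    # point them at your own Azure OpenAI resource before running.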
embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
model="text-embedding-ada-002",
deployment="dummy-q2-text-embedding",
openai_api_base="https://bleh-dummy-2.openai.azure.com/",
openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
)

    def get_reader(self, input_files: List[Union[str, Path]]):
# document parsers
file_extractor: Dict[str, BaseReader] = {
".xlsx": PandasExcelReader(),
}
if self.reader_name == "normal":
file_extractor[".pdf"] = AutoReader("UnstructuredReader")
else:
file_extractor[".pdf"] = MathpixPDFReader()
main_reader = DirectoryReader(
input_files=input_files,
file_extractor=file_extractor,
)
return main_reader
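
    # The nodes below are declared with `Node.auto` so theflow can rebuild
    # them whenever one of the attributes listed in `depends_on` changes.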
@Node.auto(depends_on=["doc_store", "vector_store", "embedding"])
def indexing_vector_pipeline(self):
return IndexVectorStoreFromDocumentPipeline(
doc_store=self.doc_store,
vector_store=self.vector_store,
embedding=self.embedding,
)
@Node.auto(depends_on=["chunk_size", "chunk_overlap"])
def text_splitter(self) -> SimpleNodeParser:
return SimpleNodeParser(
chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
)

    def run(
        self,
        file_path_list: Union[str, Path, List[Union[str, Path]]],
        force_reindex: bool = False,
    ):
self.storage_path.mkdir(exist_ok=True)
if not isinstance(file_path_list, list):
file_path_list = [file_path_list]
self.file_name_list = [Path(path).stem for path in file_path_list]
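        # Derive a deterministic collection name from the file names so the
        # same set of inputs maps to the same storage directory.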
collection_name = file_names_to_collection_name(self.file_name_list)
file_storage_path = self.storage_path / collection_name
        # Index only when forced or when no persisted index exists yet;
        # otherwise reload the existing one.
if force_reindex or not file_storage_path.exists():
file_storage_path.mkdir(exist_ok=True)
            # Parse the input files into documents
documents = self.get_reader(input_files=file_path_list)()
nodes = self.text_splitter(documents)
self.log_progress(".num_docs", num_docs=len(nodes))
self.indexing_vector_pipeline(nodes)
# persist right after indexing
self.indexing_vector_pipeline.save(file_storage_path)
else:
self.indexing_vector_pipeline.load(file_storage_path)

    def to_retrieving_pipeline(self):
retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
vector_store=self.vector_store,
doc_store=self.doc_store,
embedding=self.embedding,
)
return retrieving_pipeline

    def to_qa_pipeline(self, llm: BaseComponent, **kwargs):
        qa_pipeline = QuestionAnsweringPipeline(
            storage_path=self.storage_path,
            file_name_list=self.file_name_list,
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
            llm=llm,
            **kwargs
        )
        return qa_pipeline

    def to_agent_pipeline(self, agent: BaseAgent, **kwargs):
        agent_pipeline = AgentQAPipeline(
            storage_path=self.storage_path,
            file_name_list=self.file_name_list,
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
            agent=agent,
            **kwargs
        )
        agent_pipeline.add_search_tool()
        return agent_pipeline
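

# Minimal usage sketch (not part of the pipeline itself). It assumes a
# hypothetical input file "./example.pdf" and valid Azure OpenAI settings on
# the `embedding` component above; adjust both for your environment.
if __name__ == "__main__":
    pipeline = ReaderIndexingPipeline(reader_name="normal")
    # The first run indexes and persists the collection; subsequent runs with
    # the same files reload it unless force_reindex=True.
    pipeline.run("./example.pdf")

    # Retrieval requires no LLM, so it is a cheap way to smoke-test the index.
    retriever = pipeline.to_retrieving_pipeline()
    print(retriever("What is this document about?"))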