[AUR-429] Add MVP pipeline with Ingestion and QA stage (#39)
* add base Tool; minor updates to test_tool and test dependencies
* fix namespace conflict
* add base Agent interface; add ReWoo agent and LLMTool; update BaseAgent type
* add ReAct agent
* fix typo; remove unneeded print
* update base reader with BaseComponent
* add splitter; add TextSplitter to BaseComponent
* update agent and tool; update vectorstores
* update load/save for the indexing and retrieving pipelines
* update test_agent for more use cases; add missing test dependency
* update test case for the in-memory vectorstore
* update type hint for BaseTool
* add insurance MVP pipeline
* update requirements
* remove redundant plugins param
* mock GoogleSearch

Co-authored-by: trducng <trungduc1992@gmail.com>
parent 2638152054
commit 79cc60e6a2
knowledgehub/pipelines/ingest.py (new file, 149 lines)
@@ -0,0 +1,149 @@
import os
from pathlib import Path
from typing import List, Optional, Union

from theflow import Node, Param

from kotaemon.base import BaseComponent
from kotaemon.docstores import InMemoryDocumentStore
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.loaders import (
    AutoReader,
    DirectoryReader,
    MathpixPDFReader,
    PandasExcelReader,
)
from kotaemon.parsers.splitter import SimpleNodeParser
from kotaemon.pipelines.agents import BaseAgent
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
from kotaemon.vectorstores import InMemoryVectorStore

from .qa import AgentQAPipeline, QuestionAnsweringPipeline
from .utils import file_names_to_collection_name


class ReaderIndexingPipeline(BaseComponent):
    """Indexing pipeline which takes input from a list of files
    and performs ingestion into the vector store.
    """

    # Expose variables for users to switch in the prompt UI
    storage_path: Path = Path("./storage")
    reader_name: str = "normal"  # "normal" or "mathpix"
    openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/"
    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")
    chunk_size: int = 1024
    chunk_overlap: int = 256
    file_name_list: List[str] = list()

    @Param.decorate()
    def vector_store(self):
        return InMemoryVectorStore()

    @Param.decorate()
    def doc_store(self):
        return InMemoryDocumentStore()

    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
    def embedding(self):
        return AzureOpenAIEmbeddings(
            model="text-embedding-ada-002",
            deployment="dummy-q2-text-embedding",
            openai_api_base=self.openai_api_base,
            openai_api_key=self.openai_api_key,
        )
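    # NOTE: the endpoint and deployment name above are dummy placeholders;
    # point openai_api_base / openai_api_key (and the deployment) at a real
    # Azure OpenAI resource before use.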

    def get_reader(self, input_files: List[Union[str, Path]]):
        # document parsers, chosen by file extension
        file_extractor = {
            ".xlsx": PandasExcelReader(),
        }
        if self.reader_name == "normal":
            file_extractor[".pdf"] = AutoReader("UnstructuredReader")
        else:
            file_extractor[".pdf"] = MathpixPDFReader()
        main_reader = DirectoryReader(
            input_files=input_files,
            file_extractor=file_extractor,
        )
        return main_reader

    @Node.decorate(depends_on=["doc_store", "vector_store", "embedding"])
    def indexing_vector_pipeline(self):
        return IndexVectorStoreFromDocumentPipeline(
            doc_store=self.doc_store,
            vector_store=self.vector_store,
            embedding=self.embedding,
        )

    @Node.decorate(depends_on=["chunk_size", "chunk_overlap"])
    def text_splitter(self):
        # chunking using NodeParser from llama-index
        return SimpleNodeParser(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )
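    # With the defaults above (chunk_size=1024, chunk_overlap=256), consecutive
    # chunks share 256 tokens, i.e. each chunk starts 1024 - 256 = 768 tokens
    # after the previous one.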

    def run(
        self,
        file_path_list: Union[List[Union[str, Path]], Union[str, Path]],
        force_reindex: Optional[bool] = False,
    ):
        self.storage_path.mkdir(exist_ok=True)

        if not isinstance(file_path_list, list):
            file_path_list = [file_path_list]

        self.file_name_list = [Path(path).stem for path in file_path_list]
        collection_name = file_names_to_collection_name(self.file_name_list)

        file_storage_path = self.storage_path / collection_name

        # skip indexing if the storage path already exists
        if force_reindex or not file_storage_path.exists():
            file_storage_path.mkdir(exist_ok=True)
            # read the raw documents, then chunk them
            documents = self.get_reader(input_files=file_path_list)()
            nodes = self.text_splitter(documents)
            self.log_progress(".num_docs", num_docs=len(nodes))

            self.indexing_vector_pipeline(nodes)
            # persist right after indexing
            self.indexing_vector_pipeline.save(file_storage_path)
        else:
            self.indexing_vector_pipeline.load(file_storage_path)
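    # The first run for a given file set builds and persists the index under
    # storage_path/<collection_name>; later runs reuse the saved index unless
    # force_reindex=True is passed.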

    def to_retrieving_pipeline(self):
        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
        )
        return retrieving_pipeline

    def to_qa_pipeline(self, llm: BaseComponent, **kwargs):
        qa_pipeline = QuestionAnsweringPipeline(
            storage_path=self.storage_path,
            file_name_list=self.file_name_list,
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
            llm=llm,
            **kwargs
        )
        return qa_pipeline

    def to_agent_pipeline(self, agent: BaseAgent, **kwargs):
        agent_pipeline = AgentQAPipeline(
            storage_path=self.storage_path,
            file_name_list=self.file_name_list,
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
            agent=agent,
            **kwargs
        )
        agent_pipeline.add_search_tool()
        return agent_pipeline
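
For reference, a minimal usage sketch of the new pipeline. The file names, the question of QA, and the llm/agent objects below are illustrative placeholders, not part of this commit; any kotaemon LLM component and any BaseAgent (e.g. the new ReWoo or ReAct agents) should fit:

    pipeline = ReaderIndexingPipeline(reader_name="normal")
    # first call indexes and persists; force_reindex=True rebuilds the index
    pipeline.run(["report.pdf", "claims.xlsx"])

    retriever = pipeline.to_retrieving_pipeline()       # retrieval only
    qa = pipeline.to_qa_pipeline(llm=llm)               # retrieval + LLM answering
    agent_qa = pipeline.to_agent_pipeline(agent=agent)  # agent QA with a search tool attached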