[AUR-429] Add MVP pipeline with Ingestion and QA stage (#39)

* add base Tool

* minor update test_tool

* update test dependency

* update test dependency

* Fix namespace conflict

* update test

* add base Agent Interface, add ReWoo Agent

* minor update

* update test

* fix typo

* remove unneeded print

* update rewoo agent

* add LLMTool

* update BaseAgent type

* add ReAct agent

* add ReAct agent

* minor update

* minor update

* minor update

* minor update

* update base reader with BaseComponent

* add splitter

* update agent and tool

* update vectorstores

* update load/save for indexing and retrieving pipeline

* update test_agent for more use-cases

* add missing dependency for test

* update test case for in memory vectorstore

* add TextSplitter to BaseComponent

* update type hint basetool

* add insurance mvp pipeline

* update requirements

* Remove redundant plugins param

* Mock GoogleSearch

---------

Co-authored-by: trducng <trungduc1992@gmail.com>
Tuan Anh Nguyen Dang (Tadashi_Cin) 2023-10-05 12:31:33 +07:00 committed by GitHub
parent 2638152054
commit 79cc60e6a2
9 changed files with 389 additions and 4 deletions


@@ -22,4 +22,10 @@ class AzureOpenAIEmbeddings(LangchainEmbeddings):
     def __init__(self, **params):
         params["openai_api_type"] = "azure"
+        # openai.error.InvalidRequestError: Too many inputs. The max number of
+        # inputs is 16. We hope to increase the number of inputs per request
+        # soon. Please contact us through an Azure support request at:
+        # https://go.microsoft.com/fwlink/?linkid=2213926 for further questions.
+        params["chunk_size"] = 16
         super().__init__(**params)
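Note: with chunk_size=16, langchain's embedding wrapper slices every embed_documents call into requests of at most 16 inputs, so e.g. 40 chunks become 3 requests instead of one rejected oversized call. A rough sketch of that batching behavior, not the library's actual implementation (the embed_batch callable is a hypothetical stand-in for the real API call):

    from typing import Callable, List

    def embed_in_batches(
        texts: List[str],
        embed_batch: Callable[[List[str]], List[List[float]]],
        chunk_size: int = 16,  # Azure rejects embedding requests with more than 16 inputs
    ) -> List[List[float]]:
        # One request per slice of at most `chunk_size` texts; the returned
        # vectors are concatenated in the original order.
        vectors: List[List[float]] = []
        for start in range(0, len(texts), chunk_size):
            vectors.extend(embed_batch(texts[start : start + chunk_size]))
        return vectors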


@@ -0,0 +1,149 @@
import os
from pathlib import Path
from typing import List, Union

from theflow import Node, Param

from kotaemon.base import BaseComponent
from kotaemon.docstores import InMemoryDocumentStore
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.loaders import (
    AutoReader,
    DirectoryReader,
    MathpixPDFReader,
    PandasExcelReader,
)
from kotaemon.parsers.splitter import SimpleNodeParser
from kotaemon.pipelines.agents import BaseAgent
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
from kotaemon.vectorstores import InMemoryVectorStore

from .qa import AgentQAPipeline, QuestionAnsweringPipeline
from .utils import file_names_to_collection_name


class ReaderIndexingPipeline(BaseComponent):
    """
    Indexing pipeline that takes a list of files as input
    and ingests them into the vector store
    """

    # Expose variables for users to switch in the prompt UI
    storage_path: Path = Path("./storage")
    reader_name: str = "normal"  # "normal" or "mathpix"
    openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/"
    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")
    chunk_size: int = 1024
    chunk_overlap: int = 256
    file_name_list: List[str] = list()

    @Param.decorate()
    def vector_store(self):
        return InMemoryVectorStore()

    @Param.decorate()
    def doc_store(self):
        doc_store = InMemoryDocumentStore()
        return doc_store

    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
    def embedding(self):
        return AzureOpenAIEmbeddings(
            model="text-embedding-ada-002",
            deployment="dummy-q2-text-embedding",
            openai_api_base=self.openai_api_base,
            openai_api_key=self.openai_api_key,
        )

    def get_reader(self, input_files: List[Union[str, Path]]):
        # document parsers
        file_extractor = {
            ".xlsx": PandasExcelReader(),
        }
        if self.reader_name == "normal":
            file_extractor[".pdf"] = AutoReader("UnstructuredReader")
        else:
            file_extractor[".pdf"] = MathpixPDFReader()
        main_reader = DirectoryReader(
            input_files=input_files,
            file_extractor=file_extractor,
        )
        return main_reader

    @Node.decorate(depends_on=["doc_store", "vector_store", "embedding"])
    def indexing_vector_pipeline(self):
        return IndexVectorStoreFromDocumentPipeline(
            doc_store=self.doc_store,
            vector_store=self.vector_store,
            embedding=self.embedding,
        )

    @Node.decorate(depends_on=["chunk_size", "chunk_overlap"])
    def text_splitter(self):
        # chunking using NodeParser from llama-index
        return SimpleNodeParser(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )

    def run(
        self,
        file_path_list: Union[List[Union[str, Path]], str, Path],
        force_reindex: bool = False,
    ):
        self.storage_path.mkdir(exist_ok=True)

        if not isinstance(file_path_list, list):
            file_path_list = [file_path_list]

        self.file_name_list = [Path(path).stem for path in file_path_list]
        collection_name = file_names_to_collection_name(self.file_name_list)
        file_storage_path = self.storage_path / collection_name

        # skip indexing if the storage path already exists
        if force_reindex or not file_storage_path.exists():
            file_storage_path.mkdir(exist_ok=True)
            # reader call
            documents = self.get_reader(input_files=file_path_list)()
            nodes = self.text_splitter(documents)
            self.log_progress(".num_docs", num_docs=len(nodes))

            self.indexing_vector_pipeline(nodes)
            # persist right after indexing
            self.indexing_vector_pipeline.save(file_storage_path)
        else:
            self.indexing_vector_pipeline.load(file_storage_path)

    def to_retrieving_pipeline(self):
        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
        )
        return retrieving_pipeline

    def to_qa_pipeline(self, llm: BaseComponent, **kwargs):
        qa_pipeline = QuestionAnsweringPipeline(
            storage_path=self.storage_path,
            file_name_list=self.file_name_list,
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
            llm=llm,
            **kwargs
        )
        return qa_pipeline

    def to_agent_pipeline(self, agent: BaseAgent, **kwargs):
        agent_pipeline = AgentQAPipeline(
            storage_path=self.storage_path,
            file_name_list=self.file_name_list,
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
            agent=agent,
            **kwargs
        )
        agent_pipeline.add_search_tool()
        return agent_pipeline
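Taken together, the intended flow is: run the indexing pipeline once over a set of files, then derive retrieval and QA pipelines that share its stores. A usage sketch mirroring the new test further below (the file name and API key are hypothetical; the Azure defaults above apply unless overridden):

    from kotaemon.pipelines.ingest import ReaderIndexingPipeline

    indexing_pipeline = ReaderIndexingPipeline(openai_api_key="some-key")
    # parse, split, embed, and persist; pass force_reindex=True to rebuild
    indexing_pipeline("report.pdf", force_reindex=True)

    # reuse the populated vector store and doc store for retrieval
    retrieving_pipeline = indexing_pipeline.to_retrieving_pipeline()
    documents = retrieving_pipeline("what is the coverage limit?", top_k=3)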


@@ -0,0 +1,130 @@
import os
from pathlib import Path
from typing import List

from theflow import Node, Param

from kotaemon.base import BaseComponent
from kotaemon.docstores import InMemoryDocumentStore
from kotaemon.documents.base import RetrievedDocument
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.llms.chats.openai import AzureChatOpenAI
from kotaemon.pipelines.agents import BaseAgent
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
from kotaemon.pipelines.tools import ComponentTool
from kotaemon.prompt.template import PromptTemplate
from kotaemon.vectorstores import InMemoryVectorStore

from .utils import file_names_to_collection_name


class QuestionAnsweringPipeline(BaseComponent):
    """
    Question Answering pipeline utilizing a child Retrieving pipeline
    """

    storage_path: Path = Path("./storage")
    retrieval_top_k: int = 3
    openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/"
    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")

    file_name_list: List[str]
    """List of file names; combined with storage_path to
    create the persistent path of the vector store"""

    prompt_template: PromptTemplate = PromptTemplate(
        'Answer the following question: "{question}". '
        "The context is: \n{context}\nAnswer: "
    )

    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
    def llm(self):
        return AzureChatOpenAI(
            openai_api_base=self.openai_api_base,
            openai_api_key=self.openai_api_key,
            openai_api_version="2023-03-15-preview",
            deployment_name="dummy-q2-gpt35",
            temperature=0,
            request_timeout=60,
        )

    @Param.decorate()
    def vector_store(self):
        return InMemoryVectorStore()

    @Param.decorate()
    def doc_store(self):
        doc_store = InMemoryDocumentStore()
        return doc_store

    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
    def embedding(self):
        return AzureOpenAIEmbeddings(
            model="text-embedding-ada-002",
            deployment="dummy-q2-text-embedding",
            openai_api_base=self.openai_api_base,
            openai_api_key=self.openai_api_key,
        )

    @Node.decorate(depends_on=["doc_store", "vector_store", "embedding"])
    def retrieving_pipeline(self):
        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
        )
        # load the persisted index from the selected path
        collection_name = file_names_to_collection_name(self.file_name_list)
        retrieving_pipeline.load(self.storage_path / collection_name)
        return retrieving_pipeline

    def _format_doc_text(self, text: str) -> str:
        return text.replace("\n", " ")

    def _format_retrieved_context(self, documents: List[RetrievedDocument]) -> str:
        matched_texts: List[str] = [
            self._format_doc_text(doc.text) for doc in documents
        ]
        return "\n\n".join(matched_texts)

    def run(self, question: str) -> str:
        # retrieve relevant documents as context
        documents = self.retrieving_pipeline(question, top_k=int(self.retrieval_top_k))
        context = self._format_retrieved_context(documents)
        self.log_progress(".context", context=context)

        # generate the answer
        prompt = self.prompt_template.populate(
            context=context,
            question=question,
        )
        self.log_progress(".prompt", prompt=prompt)

        answer = self.llm(prompt).text
        return answer


class AgentQAPipeline(QuestionAnsweringPipeline):
    """
    QA pipeline utilizing a child Retrieving pipeline and an Agent pipeline
    """

    agent: BaseAgent

    def add_search_tool(self):
        search_tool = ComponentTool(
            name="search_doc",
            description=(
                "A vector store that searches for similar and "
                "related content "
                f"in a document: {' '.join(self.file_name_list)}. "
                "The result is a huge chunk of text related "
                "to your search but can also "
                "contain irrelevant info."
            ),
            postprocessor=self._format_retrieved_context,
            component=self.retrieving_pipeline,
        )
        if search_tool not in self.agent.plugins:
            self.agent.plugins.append(search_tool)

    def run(self, question: str) -> str:
        answer = self.agent(question).output
        return answer
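The QA variants are meant to be created via to_qa_pipeline / to_agent_pipeline on an already-run indexing pipeline, as the new test below does. A sketch of the agent path (assuming an indexing_pipeline that has been run and a configured ReWOO or ReAct agent; both names are placeholders here):

    # the agent gains a "search_doc" tool backed by the retrieving pipeline
    agent_pipeline = indexing_pipeline.to_agent_pipeline(agent=agent)
    answer = agent_pipeline("What does the policy say about water damage?")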


@@ -0,0 +1,17 @@
import hashlib
from typing import List


def filename_to_hash(filename: str) -> str:
    """
    Convert a filename to a hash to be used as the collection name for storage
    """
    result = hashlib.md5(filename.encode())
    return result.hexdigest()


def file_names_to_collection_name(file_name_list: List[str]) -> str:
    """
    Convert a list of filenames to a collection name
    """
    return filename_to_hash(" ".join(file_name_list))
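Hashing keeps the collection name deterministic and filesystem-safe regardless of how many files are indexed or what characters their names contain. For example (file names are illustrative):

    name = file_names_to_collection_name(["policy_2023", "claims_faq"])
    assert name == filename_to_hash("policy_2023 claims_faq")
    assert len(name) == 32  # an md5 hex digest is always 32 characters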


@@ -57,6 +57,7 @@ setuptools.setup(
             "googlesearch-python",
             "python-dotenv",
             "pytest-mock",
+            "unstructured[pdf]",
         ],
     },
     entry_points={"console_scripts": ["kh=kotaemon.cli:main"]},

tests/conftest.py

@@ -0,0 +1,15 @@
import pytest


@pytest.fixture(scope="function")
def mock_google_search(monkeypatch):
    import googlesearch

    def result(*args, **kwargs):
        yield googlesearch.SearchResult(
            url="https://www.cinnamon.is/en/",
            title="Cinnamon AI",
            description="Cinnamon AI is an enterprise AI company.",
        )

    monkeypatch.setattr(googlesearch, "search", result)
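googlesearch.search returns a lazy iterator of results, so the stand-in is written as a generator to match that contract. Any test that lists the fixture as a parameter gets the patched module for its duration; a hypothetical example:

    def test_uses_mocked_search(mock_google_search):
        import googlesearch

        results = list(googlesearch.search("any query"))
        assert results[0].url == "https://www.cinnamon.is/en/"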


@@ -135,7 +135,7 @@ def llm():
     "openai.api_resources.chat_completion.ChatCompletion.create",
     side_effect=_openai_chat_completion_responses_rewoo,
 )
-def test_rewoo_agent(openai_completion, llm):
+def test_rewoo_agent(openai_completion, llm, mock_google_search):
     plugins = [
         GoogleSearchTool(),
         WikipediaTool(),
@@ -153,7 +153,7 @@ def test_rewoo_agent(openai_completion, llm):
     "openai.api_resources.chat_completion.ChatCompletion.create",
     side_effect=_openai_chat_completion_responses_react,
 )
-def test_react_agent(openai_completion, llm):
+def test_react_agent(openai_completion, llm, mock_google_search):
     plugins = [
         GoogleSearchTool(),
         WikipediaTool(),
@@ -170,7 +170,7 @@ def test_react_agent(openai_completion, llm):
     "openai.api_resources.chat_completion.ChatCompletion.create",
     side_effect=_openai_chat_completion_responses_react,
 )
-def test_react_agent_langchain(openai_completion, llm):
+def test_react_agent_langchain(openai_completion, llm, mock_google_search):
     from langchain.agents import AgentType, initialize_agent

     plugins = [

tests/test_qa.py

@@ -0,0 +1,67 @@
import json
from pathlib import Path
from unittest.mock import patch

import pytest
from openai.api_resources.embedding import Embedding

from kotaemon.llms.chats.openai import AzureChatOpenAI
from kotaemon.pipelines.ingest import ReaderIndexingPipeline

with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
    openai_embedding = json.load(f)

_openai_chat_completion_response = {
    "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x",
    "object": "chat.completion",
    "created": 1692338378,
    "model": "gpt-35-turbo",
    "choices": [
        {
            "index": 0,
            "finish_reason": "stop",
            "message": {
                "role": "assistant",
                "content": "Hello! How can I assist you today?",
            },
        }
    ],
    "usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},
}


@pytest.fixture(scope="function")
def mock_openai_embedding(monkeypatch):
    monkeypatch.setattr(Embedding, "create", lambda *args, **kwargs: openai_embedding)


@patch(
    "openai.api_resources.chat_completion.ChatCompletion.create",
    side_effect=lambda *args, **kwargs: _openai_chat_completion_response,
)
def test_ingest_pipeline(patch, mock_openai_embedding, tmp_path):
    indexing_pipeline = ReaderIndexingPipeline(
        storage_path=tmp_path, openai_api_key="some-key"
    )
    input_file_path = Path(__file__).parent / "resources/dummy.pdf"

    # call the ingestion pipeline
    indexing_pipeline(input_file_path, force_reindex=True)
    retrieving_pipeline = indexing_pipeline.to_retrieving_pipeline()

    results = retrieving_pipeline("This is a query")
    assert len(results) == 1

    # create the llm
    llm = AzureChatOpenAI(
        openai_api_base="https://test.openai.azure.com/",
        openai_api_key="some-key",
        openai_api_version="2023-03-15-preview",
        deployment_name="gpt35turbo",
        temperature=0,
        request_timeout=60,
    )
    qa_pipeline = indexing_pipeline.to_qa_pipeline(llm=llm, openai_api_key="some-key")
    response = qa_pipeline("Summarize this document.")
    assert response


@@ -21,7 +21,7 @@ def mock_openai_embedding(monkeypatch):
     monkeypatch.setattr(Embedding, "create", lambda *args, **kwargs: openai_embedding)

-def test_google_tool():
+def test_google_tool(mock_google_search):
     tool = GoogleSearchTool()
     assert tool.name
     assert tool.description