[AUR-429] Add MVP pipeline with Ingestion and QA stage (#39)

* add base Tool * minor update test_tool * update test dependency * update test dependency * Fix namespace conflict * update test * add base Agent Interface, add ReWoo Agent * minor update * update test * fix typo * remove unneeded print * update rewoo agent * add LLMTool * update BaseAgent type * add ReAct agent * add ReAct agent * minor update * minor update * minor update * minor update * update base reader with BaseComponent * add splitter * update agent and tool * update vectorstores * update load/save for indexing and retrieving pipeline * update test_agent for more use-cases * add missing dependency for test * update test case for in memory vectorstore * add TextSplitter to BaseComponent * update type hint basetool * add insurance mvp pipeline * update requirements * Remove redundant plugins param * Mock GoogleSearch --------- Co-authored-by: trducng <trungduc1992@gmail.com>
2023-10-05 12:31:33 +07:00
parent 2638152054
commit 79cc60e6a2
9 changed files with 389 additions and 4 deletions
--- a/knowledgehub/embeddings/openai.py
+++ b/knowledgehub/embeddings/openai.py
@@ -22,4 +22,10 @@ class AzureOpenAIEmbeddings(LangchainEmbeddings):

    def __init__(self, **params):
        params["openai_api_type"] = "azure"
+
+        # openai.error.InvalidRequestError: Too many inputs. The max number of
+        # inputs is 16.  We hope to increase the number of inputs per request
+        # soon. Please contact us through an Azure support request at:
+        # https://go.microsoft.com/fwlink/?linkid=2213926 for further questions.
+        params["chunk_size"] = 16
        super().__init__(**params)
--- a/knowledgehub/pipelines/ingest.py
+++ b/knowledgehub/pipelines/ingest.py
@@ -0,0 +1,149 @@
+import os
+from pathlib import Path
+from typing import List, Optional, Union
+
+from theflow import Node, Param
+
+from kotaemon.base import BaseComponent
+from kotaemon.docstores import InMemoryDocumentStore
+from kotaemon.embeddings import AzureOpenAIEmbeddings
+from kotaemon.loaders import (
+    AutoReader,
+    DirectoryReader,
+    MathpixPDFReader,
+    PandasExcelReader,
+)
+from kotaemon.parsers.splitter import SimpleNodeParser
+from kotaemon.pipelines.agents import BaseAgent
+from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
+from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
+from kotaemon.vectorstores import InMemoryVectorStore
+
+from .qa import AgentQAPipeline, QuestionAnsweringPipeline
+from .utils import file_names_to_collection_name
+
+
+class ReaderIndexingPipeline(BaseComponent):
+    """
+    Indexing pipeline which takes input from list of files
+    and perform ingestion to vectorstore
+    """
+
+    # Expose variables for users to switch in prompt ui
+    storage_path: Path = Path("./storage")
+    reader_name: str = "normal"  # "normal" or "mathpix"
+    openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/"
+    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")
+    chunk_size: int = 1024
+    chunk_overlap: int = 256
+    file_name_list: List[str] = list()
+
+    @Param.decorate()
+    def vector_store(self):
+        return InMemoryVectorStore()
+
+    @Param.decorate()
+    def doc_store(self):
+        doc_store = InMemoryDocumentStore()
+        return doc_store
+
+    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
+    def embedding(self):
+        return AzureOpenAIEmbeddings(
+            model="text-embedding-ada-002",
+            deployment="dummy-q2-text-embedding",
+            openai_api_base=self.openai_api_base,
+            openai_api_key=self.openai_api_key,
+        )
+
+    def get_reader(self, input_files: List[Union[str, Path]]):
+        # document parsers
+        file_extractor = {
+            ".xlsx": PandasExcelReader(),
+        }
+        if self.reader_name == "normal":
+            file_extractor[".pdf"] = AutoReader("UnstructuredReader")
+        else:
+            file_extractor[".pdf"] = MathpixPDFReader()
+        main_reader = DirectoryReader(
+            input_files=input_files,
+            file_extractor=file_extractor,
+        )
+        return main_reader
+
+    @Node.decorate(depends_on=["doc_store", "vector_store", "embedding"])
+    def indexing_vector_pipeline(self):
+        return IndexVectorStoreFromDocumentPipeline(
+            doc_store=self.doc_store,
+            vector_store=self.vector_store,
+            embedding=self.embedding,
+        )
+
+    @Node.decorate(depends_on=["chunk_size", "chunk_overlap"])
+    def text_splitter(self):
+        # chunking using NodeParser from llama-index
+        return SimpleNodeParser(
+            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
+        )
+
+    def run(
+        self,
+        file_path_list: Union[List[Union[str, Path]], Union[str, Path]],
+        force_reindex: Optional[bool] = False,
+    ):
+        self.storage_path.mkdir(exist_ok=True)
+
+        if not isinstance(file_path_list, list):
+            file_path_list = [file_path_list]
+
+        self.file_name_list = [Path(path).stem for path in file_path_list]
+        collection_name = file_names_to_collection_name(self.file_name_list)
+
+        file_storage_path = self.storage_path / collection_name
+
+        # skip indexing if storage path exist
+        if force_reindex or not file_storage_path.exists():
+            file_storage_path.mkdir(exist_ok=True)
+            # reader call
+            documents = self.get_reader(input_files=file_path_list)()
+            nodes = self.text_splitter(documents)
+            self.log_progress(".num_docs", num_docs=len(nodes))
+
+            self.indexing_vector_pipeline(nodes)
+            # persist right after indexing
+            self.indexing_vector_pipeline.save(file_storage_path)
+        else:
+            self.indexing_vector_pipeline.load(file_storage_path)
+
+    def to_retrieving_pipeline(self):
+        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
+            vector_store=self.vector_store,
+            doc_store=self.doc_store,
+            embedding=self.embedding,
+        )
+        return retrieving_pipeline
+
+    def to_qa_pipeline(self, llm: BaseComponent, **kwargs):
+        qa_pipeline = QuestionAnsweringPipeline(
+            storage_path=self.storage_path,
+            file_name_list=self.file_name_list,
+            vector_store=self.vector_store,
+            doc_score=self.doc_store,
+            embedding=self.embedding,
+            llm=llm,
+            **kwargs
+        )
+        return qa_pipeline
+
+    def to_agent_pipeline(self, agent: BaseAgent, **kwargs):
+        agent_pipeline = AgentQAPipeline(
+            storage_path=self.storage_path,
+            file_name_list=self.file_name_list,
+            vector_store=self.vector_store,
+            doc_score=self.doc_store,
+            embedding=self.embedding,
+            agent=agent,
+            **kwargs
+        )
+        agent_pipeline.add_search_tool()
+        return agent_pipeline
--- a/knowledgehub/pipelines/qa.py
+++ b/knowledgehub/pipelines/qa.py
@@ -0,0 +1,130 @@
+import os
+from pathlib import Path
+from typing import List
+
+from theflow import Node, Param
+
+from kotaemon.base import BaseComponent
+from kotaemon.docstores import InMemoryDocumentStore
+from kotaemon.documents.base import RetrievedDocument
+from kotaemon.embeddings import AzureOpenAIEmbeddings
+from kotaemon.llms.chats.openai import AzureChatOpenAI
+from kotaemon.pipelines.agents import BaseAgent
+from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
+from kotaemon.pipelines.tools import ComponentTool
+from kotaemon.prompt.template import PromptTemplate
+from kotaemon.vectorstores import InMemoryVectorStore
+
+from .utils import file_names_to_collection_name
+
+
+class QuestionAnsweringPipeline(BaseComponent):
+    """
+    Question Answering pipeline ultilizing a child Retrieving pipeline
+    """
+
+    storage_path: Path = Path("./storage")
+    retrieval_top_k: int = 3
+    openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/"
+    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")
+    file_name_list: List[str]
+    """List of filename, incombination with storage_path to
+    create persistent path of vectorstore"""
+    prompt_template: PromptTemplate = PromptTemplate(
+        'Answer the following question: "{question}". '
+        "The context is: \n{context}\nAnswer: "
+    )
+
+    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
+    def llm(self):
+        return AzureChatOpenAI(
+            openai_api_base="https://bleh-dummy-2.openai.azure.com/",
+            openai_api_key=self.openai_api_key,
+            openai_api_version="2023-03-15-preview",
+            deployment_name="dummy-q2-gpt35",
+            temperature=0,
+            request_timeout=60,
+        )
+
+    @Param.decorate()
+    def vector_store(self):
+        return InMemoryVectorStore()
+
+    @Param.decorate()
+    def doc_store(self):
+        doc_store = InMemoryDocumentStore()
+        return doc_store
+
+    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
+    def embedding(self):
+        return AzureOpenAIEmbeddings(
+            model="text-embedding-ada-002",
+            deployment="dummy-q2-text-embedding",
+            openai_api_base=self.openai_api_base,
+            openai_api_key=self.openai_api_key,
+        )
+
+    @Node.decorate(depends_on=["doc_store", "vector_store", "embedding"])
+    def retrieving_pipeline(self):
+        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
+            vector_store=self.vector_store,
+            doc_store=self.doc_store,
+            embedding=self.embedding,
+        )
+        # load persistent from selected path
+        collection_name = file_names_to_collection_name(self.file_name_list)
+        retrieving_pipeline.load(self.storage_path / collection_name)
+        return retrieving_pipeline
+
+    def _format_doc_text(self, text: str) -> str:
+        return text.replace("\n", " ")
+
+    def _format_retrieved_context(self, documents: List[RetrievedDocument]) -> str:
+        matched_texts: List[str] = [
+            self._format_doc_text(doc.text) for doc in documents
+        ]
+        return "\n\n".join(matched_texts)
+
+    def run(self, question: str) -> str:
+        # retrieve relevant documents as context
+        documents = self.retrieving_pipeline(question, top_k=int(self.retrieval_top_k))
+        context = self._format_retrieved_context(documents)
+        self.log_progress(".context", context=context)
+
+        # generate the answer
+        prompt = self.prompt_template.populate(
+            context=context,
+            question=question,
+        )
+        self.log_progress(".prompt", prompt=prompt)
+        answer = self.llm(prompt).text
+        return answer
+
+
+class AgentQAPipeline(QuestionAnsweringPipeline):
+    """
+    QA pipeline ultilizing a child Retrieving pipeline and a Agent pipeline
+    """
+
+    agent: BaseAgent
+
+    def add_search_tool(self):
+        search_tool = ComponentTool(
+            name="search_doc",
+            description=(
+                "A vector store that searches for similar and "
+                "related content "
+                f"in a document: {' '.join(self.file_name_list)}. "
+                "The result is a huge chunk of text related "
+                "to your search but can also "
+                "contain irrelevant info."
+            ),
+            postprocessor=self._format_retrieved_context,
+            component=self.retrieving_pipeline,
+        )
+        if search_tool not in self.agent.plugins:
+            self.agent.plugins.append(search_tool)
+
+    def run(self, question: str) -> str:
+        answer = self.agent(question).output
+        return answer
--- a/knowledgehub/pipelines/utils.py
+++ b/knowledgehub/pipelines/utils.py
@@ -0,0 +1,17 @@
+import hashlib
+from typing import List
+
+
+def filename_to_hash(filename: str) -> str:
+    """
+    Convert filename to hash to be used as collection name for storage
+    """
+    result = hashlib.md5(filename.encode())
+    return result.hexdigest()
+
+
+def file_names_to_collection_name(file_name_list: List[str]) -> str:
+    """
+    Convert list of filenames to collection name
+    """
+    return filename_to_hash(" ".join(file_name_list))
--- a/setup.py
+++ b/setup.py
@@ -57,6 +57,7 @@ setuptools.setup(
            "googlesearch-python",
            "python-dotenv",
            "pytest-mock",
+            "unstructured[pdf]",
        ],
    },
    entry_points={"console_scripts": ["kh=kotaemon.cli:main"]},
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,15 @@
+import pytest
+
+
+@pytest.fixture(scope="function")
+def mock_google_search(monkeypatch):
+    import googlesearch
+
+    def result(*args, **kwargs):
+        yield googlesearch.SearchResult(
+            url="https://www.cinnamon.is/en/",
+            title="Cinnamon AI",
+            description="Cinnamon AI is an enterprise AI company.",
+        )
+
+    monkeypatch.setattr(googlesearch, "search", result)
--- a/tests/test_agent.py
+++ b/tests/test_agent.py
@@ -135,7 +135,7 @@ def llm():
    "openai.api_resources.chat_completion.ChatCompletion.create",
    side_effect=_openai_chat_completion_responses_rewoo,
 )
-def test_rewoo_agent(openai_completion, llm):
+def test_rewoo_agent(openai_completion, llm, mock_google_search):
    plugins = [
        GoogleSearchTool(),
        WikipediaTool(),
@@ -153,7 +153,7 @@ def test_rewoo_agent(openai_completion, llm):
    "openai.api_resources.chat_completion.ChatCompletion.create",
    side_effect=_openai_chat_completion_responses_react,
 )
-def test_react_agent(openai_completion, llm):
+def test_react_agent(openai_completion, llm, mock_google_search):
    plugins = [
        GoogleSearchTool(),
        WikipediaTool(),
@@ -170,7 +170,7 @@ def test_react_agent(openai_completion, llm):
    "openai.api_resources.chat_completion.ChatCompletion.create",
    side_effect=_openai_chat_completion_responses_react,
 )
-def test_react_agent_langchain(openai_completion, llm):
+def test_react_agent_langchain(openai_completion, llm, mock_google_search):
    from langchain.agents import AgentType, initialize_agent

    plugins = [
--- a/tests/test_qa.py
+++ b/tests/test_qa.py
@@ -0,0 +1,67 @@
+import json
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+from openai.api_resources.embedding import Embedding
+
+from kotaemon.llms.chats.openai import AzureChatOpenAI
+from kotaemon.pipelines.ingest import ReaderIndexingPipeline
+
+with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
+    openai_embedding = json.load(f)
+
+
+_openai_chat_completion_response = {
+    "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x",
+    "object": "chat.completion",
+    "created": 1692338378,
+    "model": "gpt-35-turbo",
+    "choices": [
+        {
+            "index": 0,
+            "finish_reason": "stop",
+            "message": {
+                "role": "assistant",
+                "content": "Hello! How can I assist you today?",
+            },
+        }
+    ],
+    "usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},
+}
+
+
+@pytest.fixture(scope="function")
+def mock_openai_embedding(monkeypatch):
+    monkeypatch.setattr(Embedding, "create", lambda *args, **kwargs: openai_embedding)
+
+
+@patch(
+    "openai.api_resources.chat_completion.ChatCompletion.create",
+    side_effect=lambda *args, **kwargs: _openai_chat_completion_response,
+)
+def test_ingest_pipeline(patch, mock_openai_embedding, tmp_path):
+    indexing_pipeline = ReaderIndexingPipeline(
+        storage=tmp_path, openai_api_key="some-key"
+    )
+    input_file_path = Path(__file__).parent / "resources/dummy.pdf"
+
+    # call ingestion pipeline
+    indexing_pipeline(input_file_path, force_reindex=True)
+    retrieving_pipeline = indexing_pipeline.to_retrieving_pipeline()
+
+    results = retrieving_pipeline("This is a query")
+    assert len(results) == 1
+
+    # create llm
+    llm = AzureChatOpenAI(
+        openai_api_base="https://test.openai.azure.com/",
+        openai_api_key="some-key",
+        openai_api_version="2023-03-15-preview",
+        deployment_name="gpt35turbo",
+        temperature=0,
+        request_timeout=60,
+    )
+    qa_pipeline = indexing_pipeline.to_qa_pipeline(llm=llm, openai_api_key="some-key")
+    response = qa_pipeline("Summarize this document.")
+    assert response
--- a/tests/test_tools.py
+++ b/tests/test_tools.py
@@ -21,7 +21,7 @@ def mock_openai_embedding(monkeypatch):
    monkeypatch.setattr(Embedding, "create", lambda *args, **kwargs: openai_embedding)


-def test_google_tool():
+def test_google_tool(mock_google_search):
    tool = GoogleSearchTool()
    assert tool.name
    assert tool.description