diff --git a/knowledgehub/embeddings/openai.py b/knowledgehub/embeddings/openai.py
index fbd6f5f..25de270 100644
--- a/knowledgehub/embeddings/openai.py
+++ b/knowledgehub/embeddings/openai.py
@@ -22,4 +22,10 @@ class AzureOpenAIEmbeddings(LangchainEmbeddings):
     def __init__(self, **params):
         params["openai_api_type"] = "azure"
+
+        # openai.error.InvalidRequestError: Too many inputs. The max number of
+        # inputs is 16. We hope to increase the number of inputs per request
+        # soon. Please contact us through an Azure support request at:
+        # https://go.microsoft.com/fwlink/?linkid=2213926 for further questions.
+        params["chunk_size"] = 16
         super().__init__(**params)
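Note: `chunk_size` here is LangChain's embedding *batch* size (number of texts sent per request), not a text-splitting size, so pinning it to 16 matches the Azure error quoted above. For reference, a minimal sketch of the same workaround done by hand, in case the LangChain wrapper is ever dropped (`embed_fn` is a hypothetical callable standing in for a single embedding request):

    from typing import Callable, List

    def embed_in_batches(
        texts: List[str],
        embed_fn: Callable[[List[str]], List[List[float]]],
        batch_size: int = 16,  # Azure OpenAI currently rejects >16 inputs per request
    ) -> List[List[float]]:
        """Embed texts in batches small enough for the Azure endpoint."""
        vectors: List[List[float]] = []
        for start in range(0, len(texts), batch_size):
            vectors.extend(embed_fn(texts[start : start + batch_size]))
        return vectors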
diff --git a/knowledgehub/pipelines/ingest.py b/knowledgehub/pipelines/ingest.py
new file mode 100644
index 0000000..ffafbaf
--- /dev/null
+++ b/knowledgehub/pipelines/ingest.py
@@ -0,0 +1,149 @@
+import os
+from pathlib import Path
+from typing import List, Optional, Union
+
+from theflow import Node, Param
+
+from kotaemon.base import BaseComponent
+from kotaemon.docstores import InMemoryDocumentStore
+from kotaemon.embeddings import AzureOpenAIEmbeddings
+from kotaemon.loaders import (
+    AutoReader,
+    DirectoryReader,
+    MathpixPDFReader,
+    PandasExcelReader,
+)
+from kotaemon.parsers.splitter import SimpleNodeParser
+from kotaemon.pipelines.agents import BaseAgent
+from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
+from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
+from kotaemon.vectorstores import InMemoryVectorStore
+
+from .qa import AgentQAPipeline, QuestionAnsweringPipeline
+from .utils import file_names_to_collection_name
+
+
+class ReaderIndexingPipeline(BaseComponent):
+    """
+    Indexing pipeline which takes a list of input files
+    and ingests them into the vector store
+    """
+
+    # Expose variables for users to switch in the prompt UI
+    storage_path: Path = Path("./storage")
+    reader_name: str = "normal"  # "normal" or "mathpix"
+    openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/"
+    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")
+    chunk_size: int = 1024
+    chunk_overlap: int = 256
+    file_name_list: List[str] = list()
+
+    @Param.decorate()
+    def vector_store(self):
+        return InMemoryVectorStore()
+
+    @Param.decorate()
+    def doc_store(self):
+        doc_store = InMemoryDocumentStore()
+        return doc_store
+
+    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
+    def embedding(self):
+        return AzureOpenAIEmbeddings(
+            model="text-embedding-ada-002",
+            deployment="dummy-q2-text-embedding",
+            openai_api_base=self.openai_api_base,
+            openai_api_key=self.openai_api_key,
+        )
+
+    def get_reader(self, input_files: List[Union[str, Path]]):
+        # document parsers
+        file_extractor = {
+            ".xlsx": PandasExcelReader(),
+        }
+        if self.reader_name == "normal":
+            file_extractor[".pdf"] = AutoReader("UnstructuredReader")
+        else:
+            file_extractor[".pdf"] = MathpixPDFReader()
+        main_reader = DirectoryReader(
+            input_files=input_files,
+            file_extractor=file_extractor,
+        )
+        return main_reader
+
+    @Node.decorate(depends_on=["doc_store", "vector_store", "embedding"])
+    def indexing_vector_pipeline(self):
+        return IndexVectorStoreFromDocumentPipeline(
+            doc_store=self.doc_store,
+            vector_store=self.vector_store,
+            embedding=self.embedding,
+        )
+
+    @Node.decorate(depends_on=["chunk_size", "chunk_overlap"])
+    def text_splitter(self):
+        # chunking using NodeParser from llama-index
+        return SimpleNodeParser(
+            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
+        )
+
+    def run(
+        self,
+        file_path_list: Union[List[Union[str, Path]], Union[str, Path]],
+        force_reindex: Optional[bool] = False,
+    ):
+        self.storage_path.mkdir(exist_ok=True)
+
+        if not isinstance(file_path_list, list):
+            file_path_list = [file_path_list]
+
+        self.file_name_list = [Path(path).stem for path in file_path_list]
+        collection_name = file_names_to_collection_name(self.file_name_list)
+
+        file_storage_path = self.storage_path / collection_name
+
+        # skip indexing if the storage path already exists
+        if force_reindex or not file_storage_path.exists():
+            file_storage_path.mkdir(exist_ok=True)
+            # reader call
+            documents = self.get_reader(input_files=file_path_list)()
+            nodes = self.text_splitter(documents)
+            self.log_progress(".num_docs", num_docs=len(nodes))
+
+            self.indexing_vector_pipeline(nodes)
+            # persist right after indexing
+            self.indexing_vector_pipeline.save(file_storage_path)
+        else:
+            self.indexing_vector_pipeline.load(file_storage_path)
+
+    def to_retrieving_pipeline(self):
+        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
+            vector_store=self.vector_store,
+            doc_store=self.doc_store,
+            embedding=self.embedding,
+        )
+        return retrieving_pipeline
+
+    def to_qa_pipeline(self, llm: BaseComponent, **kwargs):
+        qa_pipeline = QuestionAnsweringPipeline(
+            storage_path=self.storage_path,
+            file_name_list=self.file_name_list,
+            vector_store=self.vector_store,
+            doc_store=self.doc_store,
+            embedding=self.embedding,
+            llm=llm,
+            **kwargs
+        )
+        return qa_pipeline
+
+    def to_agent_pipeline(self, agent: BaseAgent, **kwargs):
+        agent_pipeline = AgentQAPipeline(
+            storage_path=self.storage_path,
+            file_name_list=self.file_name_list,
+            vector_store=self.vector_store,
+            doc_store=self.doc_store,
+            embedding=self.embedding,
+            agent=agent,
+            **kwargs
+        )
+        agent_pipeline.add_search_tool()
+        return agent_pipeline
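Note: a usage sketch of the new indexing pipeline as I read it — the file path and key are placeholders, and the `top_k` keyword mirrors the retriever call in `qa.py` below:

    from kotaemon.pipelines.ingest import ReaderIndexingPipeline

    pipeline = ReaderIndexingPipeline(openai_api_key="<azure-openai-key>")
    pipeline("docs/report.pdf", force_reindex=True)  # parse -> chunk -> embed -> persist

    retriever = pipeline.to_retrieving_pipeline()
    hits = retriever("What does the report conclude?", top_k=3)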
diff --git a/knowledgehub/pipelines/qa.py b/knowledgehub/pipelines/qa.py
new file mode 100644
index 0000000..d7d676d
--- /dev/null
+++ b/knowledgehub/pipelines/qa.py
@@ -0,0 +1,130 @@
+import os
+from pathlib import Path
+from typing import List
+
+from theflow import Node, Param
+
+from kotaemon.base import BaseComponent
+from kotaemon.docstores import InMemoryDocumentStore
+from kotaemon.documents.base import RetrievedDocument
+from kotaemon.embeddings import AzureOpenAIEmbeddings
+from kotaemon.llms.chats.openai import AzureChatOpenAI
+from kotaemon.pipelines.agents import BaseAgent
+from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
+from kotaemon.pipelines.tools import ComponentTool
+from kotaemon.prompt.template import PromptTemplate
+from kotaemon.vectorstores import InMemoryVectorStore
+
+from .utils import file_names_to_collection_name
+
+
+class QuestionAnsweringPipeline(BaseComponent):
+    """
+    Question answering pipeline utilizing a child retrieving pipeline
+    """
+
+    storage_path: Path = Path("./storage")
+    retrieval_top_k: int = 3
+    openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/"
+    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")
+    file_name_list: List[str]
+    """List of file names; combined with storage_path to
+    create the persistent path of the vector store"""
+    prompt_template: PromptTemplate = PromptTemplate(
+        'Answer the following question: "{question}". '
+        "The context is: \n{context}\nAnswer: "
+    )
+
+    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
+    def llm(self):
+        return AzureChatOpenAI(
+            openai_api_base=self.openai_api_base,
+            openai_api_key=self.openai_api_key,
+            openai_api_version="2023-03-15-preview",
+            deployment_name="dummy-q2-gpt35",
+            temperature=0,
+            request_timeout=60,
+        )
+
+    @Param.decorate()
+    def vector_store(self):
+        return InMemoryVectorStore()
+
+    @Param.decorate()
+    def doc_store(self):
+        doc_store = InMemoryDocumentStore()
+        return doc_store
+
+    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
+    def embedding(self):
+        return AzureOpenAIEmbeddings(
+            model="text-embedding-ada-002",
+            deployment="dummy-q2-text-embedding",
+            openai_api_base=self.openai_api_base,
+            openai_api_key=self.openai_api_key,
+        )
+
+    @Node.decorate(depends_on=["doc_store", "vector_store", "embedding"])
+    def retrieving_pipeline(self):
+        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
+            vector_store=self.vector_store,
+            doc_store=self.doc_store,
+            embedding=self.embedding,
+        )
+        # load the persisted index from the selected path
+        collection_name = file_names_to_collection_name(self.file_name_list)
+        retrieving_pipeline.load(self.storage_path / collection_name)
+        return retrieving_pipeline
+
+    def _format_doc_text(self, text: str) -> str:
+        return text.replace("\n", " ")
+
+    def _format_retrieved_context(self, documents: List[RetrievedDocument]) -> str:
+        matched_texts: List[str] = [
+            self._format_doc_text(doc.text) for doc in documents
+        ]
+        return "\n\n".join(matched_texts)
+
+    def run(self, question: str) -> str:
+        # retrieve relevant documents as context
+        documents = self.retrieving_pipeline(question, top_k=int(self.retrieval_top_k))
+        context = self._format_retrieved_context(documents)
+        self.log_progress(".context", context=context)
+
+        # generate the answer
+        prompt = self.prompt_template.populate(
+            context=context,
+            question=question,
+        )
+        self.log_progress(".prompt", prompt=prompt)
+        answer = self.llm(prompt).text
+        return answer
+
+
+class AgentQAPipeline(QuestionAnsweringPipeline):
+    """
+    QA pipeline utilizing a child retrieving pipeline and an agent pipeline
+    """
+
+    agent: BaseAgent
+
+    def add_search_tool(self):
+        search_tool = ComponentTool(
+            name="search_doc",
+            description=(
+                "A vector store that searches for similar and "
+                "related content "
+                f"in a document: {' '.join(self.file_name_list)}. "
+                "The result is a huge chunk of text related "
+                "to your search but can also "
+                "contain irrelevant info."
+            ),
+            postprocessor=self._format_retrieved_context,
+            component=self.retrieving_pipeline,
+        )
+        if search_tool not in self.agent.plugins:
+            self.agent.plugins.append(search_tool)
+
+    def run(self, question: str) -> str:
+        answer = self.agent(question).output
+        return answer
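Note: end-to-end sketch of the QA path on top of an existing index — resource and deployment names are placeholders patterned on the defaults above:

    from kotaemon.llms.chats.openai import AzureChatOpenAI
    from kotaemon.pipelines.ingest import ReaderIndexingPipeline

    indexing = ReaderIndexingPipeline(openai_api_key="<azure-openai-key>")
    indexing("docs/report.pdf")  # builds the index, or loads it if already persisted

    llm = AzureChatOpenAI(
        openai_api_base="https://<resource>.openai.azure.com/",
        openai_api_key="<azure-openai-key>",
        openai_api_version="2023-03-15-preview",
        deployment_name="<chat-deployment>",
        temperature=0,
    )
    qa = indexing.to_qa_pipeline(llm=llm)
    print(qa("What does the report conclude?"))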
diff --git a/knowledgehub/pipelines/utils.py b/knowledgehub/pipelines/utils.py
new file mode 100644
index 0000000..a7554f4
--- /dev/null
+++ b/knowledgehub/pipelines/utils.py
@@ -0,0 +1,17 @@
+import hashlib
+from typing import List
+
+
+def filename_to_hash(filename: str) -> str:
+    """
+    Convert a filename to a hash to be used as the collection name for storage
+    """
+    result = hashlib.md5(filename.encode())
+    return result.hexdigest()
+
+
+def file_names_to_collection_name(file_name_list: List[str]) -> str:
+    """
+    Convert a list of filenames to a collection name
+    """
+    return filename_to_hash(" ".join(file_name_list))
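Note: collection names are deterministic, so re-ingesting the same file set resolves to the same storage directory. The list is joined in order before hashing, though, so the same files passed in a different order produce a different collection — possibly worth sorting upstream. For example:

    import hashlib

    from kotaemon.pipelines.utils import file_names_to_collection_name

    assert (
        file_names_to_collection_name(["report", "summary"])
        == hashlib.md5("report summary".encode()).hexdigest()
    )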
"gpt-35-turbo", + "choices": [ + { + "index": 0, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": "Hello! How can I assist you today?", + }, + } + ], + "usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19}, +} + + +@pytest.fixture(scope="function") +def mock_openai_embedding(monkeypatch): + monkeypatch.setattr(Embedding, "create", lambda *args, **kwargs: openai_embedding) + + +@patch( + "openai.api_resources.chat_completion.ChatCompletion.create", + side_effect=lambda *args, **kwargs: _openai_chat_completion_response, +) +def test_ingest_pipeline(patch, mock_openai_embedding, tmp_path): + indexing_pipeline = ReaderIndexingPipeline( + storage=tmp_path, openai_api_key="some-key" + ) + input_file_path = Path(__file__).parent / "resources/dummy.pdf" + + # call ingestion pipeline + indexing_pipeline(input_file_path, force_reindex=True) + retrieving_pipeline = indexing_pipeline.to_retrieving_pipeline() + + results = retrieving_pipeline("This is a query") + assert len(results) == 1 + + # create llm + llm = AzureChatOpenAI( + openai_api_base="https://test.openai.azure.com/", + openai_api_key="some-key", + openai_api_version="2023-03-15-preview", + deployment_name="gpt35turbo", + temperature=0, + request_timeout=60, + ) + qa_pipeline = indexing_pipeline.to_qa_pipeline(llm=llm, openai_api_key="some-key") + response = qa_pipeline("Summarize this document.") + assert response diff --git a/tests/test_tools.py b/tests/test_tools.py index c010862..e5d22ae 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -21,7 +21,7 @@ def mock_openai_embedding(monkeypatch): monkeypatch.setattr(Embedding, "create", lambda *args, **kwargs: openai_embedding) -def test_google_tool(): +def test_google_tool(mock_google_search): tool = GoogleSearchTool() assert tool.name assert tool.description