Refactor the index component and update the MVP insurance accordingly (#90)
Refactor the `kotaemon/pipelines` module into `kotaemon/indices`. Create the VectorIndex. Note: `qa` is currently placed inside `kotaemon/indices`, since at the moment we only have `qa` in RAG. That said, I think `qa` could be an independent module at `kotaemon/qa`. Since this can be changed later, I am going with the first option for now and will revisit whether it should be moved.
This commit is contained in:
committed by
GitHub
parent
8e3a1d193f
commit
e34b1e4c6d
@@ -5,8 +5,8 @@ from theflow.utils.modules import ObjectInitDeclaration as _
|
||||
|
||||
from kotaemon.base import BaseComponent
|
||||
from kotaemon.embeddings import AzureOpenAIEmbeddings
|
||||
from kotaemon.indices import VectorRetrieval
|
||||
from kotaemon.llms.completions.openai import AzureOpenAI
|
||||
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
|
||||
from kotaemon.storages import ChromaVectorStore
|
||||
|
||||
|
||||
@@ -20,16 +20,14 @@ class Pipeline(BaseComponent):
|
||||
request_timeout=60,
|
||||
)
|
||||
|
||||
retrieving_pipeline: RetrieveDocumentFromVectorStorePipeline = (
|
||||
RetrieveDocumentFromVectorStorePipeline.withx(
|
||||
vector_store=_(ChromaVectorStore).withx(path=str(tempfile.mkdtemp())),
|
||||
embedding=AzureOpenAIEmbeddings.withx(
|
||||
model="text-embedding-ada-002",
|
||||
deployment="embedding-deployment",
|
||||
openai_api_base="https://test.openai.azure.com/",
|
||||
openai_api_key="some-key",
|
||||
),
|
||||
)
|
||||
retrieving_pipeline: VectorRetrieval = VectorRetrieval.withx(
|
||||
vector_store=_(ChromaVectorStore).withx(path=str(tempfile.mkdtemp())),
|
||||
embedding=AzureOpenAIEmbeddings.withx(
|
||||
model="text-embedding-ada-002",
|
||||
deployment="embedding-deployment",
|
||||
openai_api_base="https://test.openai.azure.com/",
|
||||
openai_api_key="some-key",
|
||||
),
|
||||
)
|
||||
|
||||
def run_raw(self, text: str) -> str:
|
||||
|
@@ -4,8 +4,8 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
from openai.types.chat.chat_completion import ChatCompletion
|
||||
|
||||
from kotaemon.indices.qa import CitationPipeline
|
||||
from kotaemon.llms.chats.openai import AzureChatOpenAI
|
||||
from kotaemon.pipelines.citation import CitationPipeline
|
||||
|
||||
function_output = '{\n "question": "What is the provided _example_ benefits?",\n "answer": [\n {\n "fact": "特約死亡保険金: 被保険者がこの特約の保険期間中に死亡したときに支払います。",\n "substring_quote": ["特約死亡保険金"]\n },\n {\n "fact": "特約特定疾病保険金: 被保険者がこの特約の保険期間中に特定の疾病(悪性新生物(がん)、急性心筋梗塞または脳卒中)により所定の状態に該当したときに支払います。",\n "substring_quote": ["特約特定疾病保険金"]\n },\n {\n "fact": "特約障害保険金: 被保険者がこの特約の保険期間中に傷害もしくは疾病により所定の身体障害の状態に該当したとき、または不慮の事故により所定の身体障害の状態に該当したときに支払います。",\n "substring_quote": ["特約障害保険金"]\n },\n {\n "fact": "特約介護保険金: 被保険者がこの特約の保険期間中に傷害または疾病により所定の要介護状態に該当したときに支払います。",\n "substring_quote": ["特約介護保険金"]\n }\n ]\n}'
|
||||
|
||||
|
@@ -2,8 +2,8 @@ from unittest.mock import patch
|
||||
|
||||
from openai.types.chat.chat_completion import ChatCompletion
|
||||
|
||||
from kotaemon.llms.chats.openai import AzureChatOpenAI
|
||||
from kotaemon.pipelines.cot import ManualSequentialChainOfThought, Thought
|
||||
from kotaemon.llms import AzureChatOpenAI
|
||||
from kotaemon.llms.cot import ManualSequentialChainOfThought, Thought
|
||||
|
||||
_openai_chat_completion_response = [
|
||||
ChatCompletion.parse_obj(
|
||||
|
@@ -7,8 +7,7 @@ from openai.resources.embeddings import Embeddings
|
||||
|
||||
from kotaemon.base import Document
|
||||
from kotaemon.embeddings.openai import AzureOpenAIEmbeddings
|
||||
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
|
||||
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
|
||||
from kotaemon.indices import VectorIndexing, VectorRetrieval
|
||||
from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore
|
||||
|
||||
with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
|
||||
@@ -30,9 +29,7 @@ def test_indexing(mock_openai_embedding, tmp_path):
|
||||
openai_api_key="some-key",
|
||||
)
|
||||
|
||||
pipeline = IndexVectorStoreFromDocumentPipeline(
|
||||
vector_store=db, embedding=embedding, doc_store=doc_store
|
||||
)
|
||||
pipeline = VectorIndexing(vector_store=db, embedding=embedding, doc_store=doc_store)
|
||||
pipeline.doc_store = cast(InMemoryDocumentStore, pipeline.doc_store)
|
||||
pipeline.vector_store = cast(ChromaVectorStore, pipeline.vector_store)
|
||||
assert pipeline.vector_store._collection.count() == 0, "Expected empty collection"
|
||||
@@ -52,10 +49,10 @@ def test_retrieving(mock_openai_embedding, tmp_path):
|
||||
openai_api_key="some-key",
|
||||
)
|
||||
|
||||
index_pipeline = IndexVectorStoreFromDocumentPipeline(
|
||||
index_pipeline = VectorIndexing(
|
||||
vector_store=db, embedding=embedding, doc_store=doc_store
|
||||
)
|
||||
retrieval_pipeline = RetrieveDocumentFromVectorStorePipeline(
|
||||
retrieval_pipeline = VectorRetrieval(
|
||||
vector_store=db, doc_store=doc_store, embedding=embedding
|
||||
)
|
||||
|
||||
|
@@ -1,73 +0,0 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from openai.resources.embeddings import Embeddings
|
||||
from openai.types.chat.chat_completion import ChatCompletion
|
||||
|
||||
from kotaemon.llms.chats.openai import AzureChatOpenAI
|
||||
from kotaemon.pipelines.ingest import ReaderIndexingPipeline
|
||||
|
||||
with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
|
||||
openai_embedding = json.load(f)
|
||||
|
||||
|
||||
_openai_chat_completion_response = ChatCompletion.parse_obj(
|
||||
{
|
||||
"id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x",
|
||||
"object": "chat.completion",
|
||||
"created": 1692338378,
|
||||
"model": "gpt-35-turbo",
|
||||
"system_fingerprint": None,
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"finish_reason": "stop",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "Hello! How can I assist you today?",
|
||||
"function_call": None,
|
||||
"tool_calls": None,
|
||||
},
|
||||
}
|
||||
],
|
||||
"usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def mock_openai_embedding(monkeypatch):
|
||||
monkeypatch.setattr(Embeddings, "create", lambda *args, **kwargs: openai_embedding)
|
||||
|
||||
|
||||
@patch(
|
||||
"openai.resources.chat.completions.Completions.create",
|
||||
side_effect=lambda *args, **kwargs: _openai_chat_completion_response,
|
||||
)
|
||||
def test_ingest_pipeline(patch, mock_openai_embedding, tmp_path):
|
||||
indexing_pipeline = ReaderIndexingPipeline(
|
||||
storage_path=tmp_path,
|
||||
)
|
||||
indexing_pipeline.embedding.openai_api_key = "some-key"
|
||||
input_file_path = Path(__file__).parent / "resources/dummy.pdf"
|
||||
|
||||
# call ingestion pipeline
|
||||
indexing_pipeline(input_file_path, force_reindex=True)
|
||||
retrieving_pipeline = indexing_pipeline.to_retrieving_pipeline()
|
||||
|
||||
results = retrieving_pipeline("This is a query")
|
||||
assert len(results) == 1
|
||||
|
||||
# create llm
|
||||
llm = AzureChatOpenAI(
|
||||
openai_api_base="https://test.openai.azure.com/",
|
||||
openai_api_key="some-key",
|
||||
openai_api_version="2023-03-15-preview",
|
||||
deployment_name="gpt35turbo",
|
||||
temperature=0,
|
||||
)
|
||||
qa_pipeline = indexing_pipeline.to_qa_pipeline(llm=llm, openai_api_key="some-key")
|
||||
response = qa_pipeline("Summarize this document.")
|
||||
assert response
|
@@ -7,8 +7,7 @@ from openai.resources.embeddings import Embeddings
|
||||
from kotaemon.agents.tools import ComponentTool, GoogleSearchTool, WikipediaTool
|
||||
from kotaemon.base import Document
|
||||
from kotaemon.embeddings.openai import AzureOpenAIEmbeddings
|
||||
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
|
||||
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
|
||||
from kotaemon.indices.vectorindex import VectorIndexing, VectorRetrieval
|
||||
from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore
|
||||
|
||||
with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
|
||||
@@ -46,10 +45,10 @@ def test_pipeline_tool(mock_openai_embedding, tmp_path):
|
||||
openai_api_key="some-key",
|
||||
)
|
||||
|
||||
index_pipeline = IndexVectorStoreFromDocumentPipeline(
|
||||
index_pipeline = VectorIndexing(
|
||||
vector_store=db, embedding=embedding, doc_store=doc_store
|
||||
)
|
||||
retrieval_pipeline = RetrieveDocumentFromVectorStorePipeline(
|
||||
retrieval_pipeline = VectorRetrieval(
|
||||
vector_store=db, doc_store=doc_store, embedding=embedding
|
||||
)
|
||||
|
||||
|
Reference in New Issue
Block a user