Refactor the index component and update the MVP insurance accordingly (#90)

Refactor the `kotaemon/pipelines` module to `kotaemon/indices`. Create the VectorIndex.

Note: for now I have placed `qa` inside `kotaemon/indices`, since at the moment we only have `qa` in RAG. That said, I think `qa` could also be an independent module at `kotaemon/qa`. Since this can be changed later, I am going with the first option for now and will revisit whether it should be moved.
This commit is contained in:
Duc Nguyen (john)
2023-11-30 18:35:07 +07:00
committed by GitHub
parent 8e3a1d193f
commit e34b1e4c6d
25 changed files with 396 additions and 605 deletions

View File

@@ -7,8 +7,7 @@ from openai.resources.embeddings import Embeddings
from kotaemon.base import Document
from kotaemon.embeddings.openai import AzureOpenAIEmbeddings
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
from kotaemon.indices import VectorIndexing, VectorRetrieval
from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore
with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
@@ -30,9 +29,7 @@ def test_indexing(mock_openai_embedding, tmp_path):
openai_api_key="some-key",
)
pipeline = IndexVectorStoreFromDocumentPipeline(
vector_store=db, embedding=embedding, doc_store=doc_store
)
pipeline = VectorIndexing(vector_store=db, embedding=embedding, doc_store=doc_store)
pipeline.doc_store = cast(InMemoryDocumentStore, pipeline.doc_store)
pipeline.vector_store = cast(ChromaVectorStore, pipeline.vector_store)
assert pipeline.vector_store._collection.count() == 0, "Expected empty collection"
@@ -52,10 +49,10 @@ def test_retrieving(mock_openai_embedding, tmp_path):
openai_api_key="some-key",
)
index_pipeline = IndexVectorStoreFromDocumentPipeline(
index_pipeline = VectorIndexing(
vector_store=db, embedding=embedding, doc_store=doc_store
)
retrieval_pipeline = RetrieveDocumentFromVectorStorePipeline(
retrieval_pipeline = VectorRetrieval(
vector_store=db, doc_store=doc_store, embedding=embedding
)