From b159897ac62ac8f9e17ccf68bd0564d007281f08 Mon Sep 17 00:00:00 2001 From: "Nguyen Trung Duc (john)" Date: Tue, 14 Nov 2023 17:50:57 +0700 Subject: [PATCH] Combine docstores and vectorstores within a storages component (#72) --- knowledgehub/pipelines/indexing.py | 3 +-- knowledgehub/pipelines/ingest.py | 8 ++++++-- knowledgehub/pipelines/qa.py | 8 ++++++-- knowledgehub/pipelines/retrieving.py | 3 +-- knowledgehub/storages/__init__.py | 12 ++++++++++++ knowledgehub/{ => storages}/docstores/__init__.py | 0 knowledgehub/{ => storages}/docstores/base.py | 2 +- knowledgehub/{ => storages}/docstores/in_memory.py | 2 +- knowledgehub/{ => storages}/vectorstores/__init__.py | 0 knowledgehub/{ => storages}/vectorstores/base.py | 2 +- knowledgehub/{ => storages}/vectorstores/chroma.py | 0 .../{ => storages}/vectorstores/in_memory.py | 2 +- .../{{cookiecutter.project_name}}/pipeline.py | 3 +-- tests/simple_pipeline.py | 2 +- tests/test_docstores.py | 2 +- tests/test_indexing_retrieval.py | 3 +-- tests/test_tools.py | 3 +-- tests/test_vectorstore.py | 2 +- 18 files changed, 36 insertions(+), 21 deletions(-) create mode 100644 knowledgehub/storages/__init__.py rename knowledgehub/{ => storages}/docstores/__init__.py (100%) rename knowledgehub/{ => storages}/docstores/base.py (97%) rename knowledgehub/{ => storages}/docstores/in_memory.py (98%) rename knowledgehub/{ => storages}/vectorstores/__init__.py (100%) rename knowledgehub/{ => storages}/vectorstores/base.py (99%) rename knowledgehub/{ => storages}/vectorstores/chroma.py (100%) rename knowledgehub/{ => storages}/vectorstores/in_memory.py (96%) diff --git a/knowledgehub/pipelines/indexing.py b/knowledgehub/pipelines/indexing.py index c98ca38..97e75a6 100644 --- a/knowledgehub/pipelines/indexing.py +++ b/knowledgehub/pipelines/indexing.py @@ -6,9 +6,8 @@ from pathlib import Path from theflow import Node, Param from ..base import BaseComponent, Document -from ..docstores import BaseDocumentStore from ..embeddings import BaseEmbeddings -from ..vectorstores import BaseVectorStore +from ..storages import BaseDocumentStore, BaseVectorStore VECTOR_STORE_FNAME = "vectorstore" DOC_STORE_FNAME = "docstore" diff --git a/knowledgehub/pipelines/ingest.py b/knowledgehub/pipelines/ingest.py index cca86db..d3f19b8 100644 --- a/knowledgehub/pipelines/ingest.py +++ b/knowledgehub/pipelines/ingest.py @@ -7,7 +7,6 @@ from theflow import Node from theflow.utils.modules import ObjectInitDeclaration as _ from kotaemon.base import BaseComponent -from kotaemon.docstores import BaseDocumentStore, InMemoryDocumentStore from kotaemon.embeddings import AzureOpenAIEmbeddings from kotaemon.loaders import ( AutoReader, @@ -20,7 +19,12 @@ from kotaemon.parsers.splitter import SimpleNodeParser from kotaemon.pipelines.agents import BaseAgent from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline -from kotaemon.vectorstores import BaseVectorStore, InMemoryVectorStore +from kotaemon.storages import ( + BaseDocumentStore, + BaseVectorStore, + InMemoryDocumentStore, + InMemoryVectorStore, +) from .qa import AgentQAPipeline, QuestionAnsweringPipeline from .utils import file_names_to_collection_name diff --git a/knowledgehub/pipelines/qa.py b/knowledgehub/pipelines/qa.py index d7825fa..0763535 100644 --- a/knowledgehub/pipelines/qa.py +++ b/knowledgehub/pipelines/qa.py @@ -7,14 +7,18 @@ from theflow.utils.modules import ObjectInitDeclaration as _ from kotaemon.base import BaseComponent from kotaemon.base.schema import RetrievedDocument -from kotaemon.docstores import BaseDocumentStore, InMemoryDocumentStore from kotaemon.embeddings import AzureOpenAIEmbeddings from kotaemon.llms import PromptTemplate from kotaemon.llms.chats.openai import AzureChatOpenAI from kotaemon.pipelines.agents import BaseAgent from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline from kotaemon.pipelines.tools import ComponentTool -from kotaemon.vectorstores import BaseVectorStore, InMemoryVectorStore +from kotaemon.storages import ( + BaseDocumentStore, + BaseVectorStore, + InMemoryDocumentStore, + InMemoryVectorStore, +) from .utils import file_names_to_collection_name diff --git a/knowledgehub/pipelines/retrieving.py b/knowledgehub/pipelines/retrieving.py index fdd612e..7003391 100644 --- a/knowledgehub/pipelines/retrieving.py +++ b/knowledgehub/pipelines/retrieving.py @@ -7,9 +7,8 @@ from theflow import Node, Param from ..base import BaseComponent from ..base.schema import Document, RetrievedDocument -from ..docstores import BaseDocumentStore from ..embeddings import BaseEmbeddings -from ..vectorstores import BaseVectorStore +from ..storages import BaseDocumentStore, BaseVectorStore VECTOR_STORE_FNAME = "vectorstore" DOC_STORE_FNAME = "docstore" diff --git a/knowledgehub/storages/__init__.py b/knowledgehub/storages/__init__.py new file mode 100644 index 0000000..d700d60 --- /dev/null +++ b/knowledgehub/storages/__init__.py @@ -0,0 +1,12 @@ +from .docstores import BaseDocumentStore, InMemoryDocumentStore +from .vectorstores import BaseVectorStore, ChromaVectorStore, InMemoryVectorStore + +__all__ = [ + # Document stores + "BaseDocumentStore", + "InMemoryDocumentStore", + # Vector stores + "BaseVectorStore", + "ChromaVectorStore", + "InMemoryVectorStore", +] diff --git a/knowledgehub/docstores/__init__.py b/knowledgehub/storages/docstores/__init__.py similarity index 100% rename from knowledgehub/docstores/__init__.py rename to knowledgehub/storages/docstores/__init__.py diff --git a/knowledgehub/docstores/base.py b/knowledgehub/storages/docstores/base.py similarity index 97% rename from knowledgehub/docstores/base.py rename to knowledgehub/storages/docstores/base.py index ce2a55d..e057f0b 100644 --- a/knowledgehub/docstores/base.py +++ b/knowledgehub/storages/docstores/base.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from pathlib import Path from typing import List, Optional, Union -from ..base import Document +from ...base import Document class BaseDocumentStore(ABC): diff --git a/knowledgehub/docstores/in_memory.py b/knowledgehub/storages/docstores/in_memory.py similarity index 98% rename from knowledgehub/docstores/in_memory.py rename to knowledgehub/storages/docstores/in_memory.py index 339d735..23f3e22 100644 --- a/knowledgehub/docstores/in_memory.py +++ b/knowledgehub/storages/docstores/in_memory.py @@ -2,7 +2,7 @@ import json from pathlib import Path from typing import List, Optional, Union -from ..base import Document +from ...base import Document from .base import BaseDocumentStore diff --git a/knowledgehub/vectorstores/__init__.py b/knowledgehub/storages/vectorstores/__init__.py similarity index 100% rename from knowledgehub/vectorstores/__init__.py rename to knowledgehub/storages/vectorstores/__init__.py diff --git a/knowledgehub/vectorstores/base.py b/knowledgehub/storages/vectorstores/base.py similarity index 99% rename from knowledgehub/vectorstores/base.py rename to knowledgehub/storages/vectorstores/base.py index 5df6792..1ddaef0 100644 --- a/knowledgehub/vectorstores/base.py +++ b/knowledgehub/storages/vectorstores/base.py @@ -6,7 +6,7 @@ from llama_index.vector_stores.types import BasePydanticVectorStore from llama_index.vector_stores.types import VectorStore as LIVectorStore from llama_index.vector_stores.types import VectorStoreQuery -from ..base import Document +from ...base import Document class BaseVectorStore(ABC): diff --git a/knowledgehub/vectorstores/chroma.py b/knowledgehub/storages/vectorstores/chroma.py similarity index 100% rename from knowledgehub/vectorstores/chroma.py rename to knowledgehub/storages/vectorstores/chroma.py diff --git a/knowledgehub/vectorstores/in_memory.py b/knowledgehub/storages/vectorstores/in_memory.py similarity index 96% rename from knowledgehub/vectorstores/in_memory.py rename to knowledgehub/storages/vectorstores/in_memory.py index 4bd8a5f..f8f20cc 100644 --- a/knowledgehub/vectorstores/in_memory.py +++ b/knowledgehub/storages/vectorstores/in_memory.py @@ -6,7 +6,7 @@ import fsspec from llama_index.vector_stores import SimpleVectorStore as LISimpleVectorStore from llama_index.vector_stores.simple import SimpleVectorStoreData -from kotaemon.vectorstores.base import LlamaIndexVectorStore +from .base import LlamaIndexVectorStore class InMemoryVectorStore(LlamaIndexVectorStore): diff --git a/templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/pipeline.py b/templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/pipeline.py index 6cf4758..8b98b5d 100644 --- a/templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/pipeline.py +++ b/templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/pipeline.py @@ -5,12 +5,11 @@ from theflow import Param from theflow.utils.modules import ObjectInitDeclaration as _ from kotaemon.base import BaseComponent -from kotaemon.docstores import InMemoryDocumentStore from kotaemon.embeddings import AzureOpenAIEmbeddings from kotaemon.llms.completions.openai import AzureOpenAI from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline -from kotaemon.vectorstores import ChromaVectorStore +from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore class QuestionAnsweringPipeline(BaseComponent): diff --git a/tests/simple_pipeline.py b/tests/simple_pipeline.py index a490699..295eead 100644 --- a/tests/simple_pipeline.py +++ b/tests/simple_pipeline.py @@ -7,7 +7,7 @@ from kotaemon.base import BaseComponent from kotaemon.embeddings import AzureOpenAIEmbeddings from kotaemon.llms.completions.openai import AzureOpenAI from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline -from kotaemon.vectorstores import ChromaVectorStore +from kotaemon.storages import ChromaVectorStore class Pipeline(BaseComponent): diff --git a/tests/test_docstores.py b/tests/test_docstores.py index db1da9e..a6cb9a0 100644 --- a/tests/test_docstores.py +++ b/tests/test_docstores.py @@ -1,7 +1,7 @@ import pytest from kotaemon.base import Document -from kotaemon.docstores import InMemoryDocumentStore +from kotaemon.storages import InMemoryDocumentStore def test_simple_document_store_base_interfaces(tmp_path): diff --git a/tests/test_indexing_retrieval.py b/tests/test_indexing_retrieval.py index 6fb9ca6..cd0cb50 100644 --- a/tests/test_indexing_retrieval.py +++ b/tests/test_indexing_retrieval.py @@ -6,11 +6,10 @@ import pytest from openai.resources.embeddings import Embeddings from kotaemon.base import Document -from kotaemon.docstores import InMemoryDocumentStore from kotaemon.embeddings.openai import AzureOpenAIEmbeddings from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline -from kotaemon.vectorstores import ChromaVectorStore +from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f: openai_embedding = json.load(f) diff --git a/tests/test_tools.py b/tests/test_tools.py index 2f336f5..9b9d0ec 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -5,12 +5,11 @@ import pytest from openai.resources.embeddings import Embeddings from kotaemon.base import Document -from kotaemon.docstores import InMemoryDocumentStore from kotaemon.embeddings.openai import AzureOpenAIEmbeddings from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline from kotaemon.pipelines.tools import ComponentTool, GoogleSearchTool, WikipediaTool -from kotaemon.vectorstores import ChromaVectorStore +from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f: openai_embedding = json.load(f) diff --git a/tests/test_vectorstore.py b/tests/test_vectorstore.py index 0e9a9d0..1a7ac65 100644 --- a/tests/test_vectorstore.py +++ b/tests/test_vectorstore.py @@ -1,7 +1,7 @@ import json from kotaemon.base import Document -from kotaemon.vectorstores import ChromaVectorStore, InMemoryVectorStore +from kotaemon.storages import ChromaVectorStore, InMemoryVectorStore class TestChromaVectorStore: