Add file-based document store and vector store (#96)

* Modify docstore and vectorstore objects to be reconstructable
* Simplify the file docstore
* Use the simple file docstore and vector store in MVP
2023-12-04 17:46:00 +07:00 · 2023-12-04 17:46:00 +07:00 · 37c744b616
commit 37c744b616
parent 0ce3a8832f
18 changed files with 324 additions and 149 deletions
--- a/knowledgehub/embeddings/langchain_based.py
+++ b/knowledgehub/embeddings/langchain_based.py
@ -73,10 +73,13 @@ class LCEmbeddingMixin:
            return self._kwargs[name]
        return getattr(self._obj, name)

-    def dump(self):
+    def dump(self, *args, **kwargs):
+        from theflow.utils.modules import serialize
+
+        params = {key: serialize(value) for key, value in self._kwargs.items()}
        return {
            "__type__": f"{self.__module__}.{self.__class__.__qualname__}",
-            **self._kwargs,
+            **params,
        }

    def specs(self, path: str):
--- a/knowledgehub/indices/base.py
+++ b/knowledgehub/indices/base.py
@ -82,10 +82,13 @@ class LlamaIndexDocTransformerMixin:
            return self._kwargs[name]
        return getattr(self._obj, name)

-    def dump(self):
+    def dump(self, *args, **kwargs):
+        from theflow.utils.modules import serialize
+
+        params = {key: serialize(value) for key, value in self._kwargs.items()}
        return {
            "__type__": f"{self.__module__}.{self.__class__.__qualname__}",
-            **self._kwargs,
+            **params,
        }

    def run(
--- a/knowledgehub/indices/vectorindex.py
+++ b/knowledgehub/indices/vectorindex.py
@ -1,7 +1,6 @@
 from __future__ import annotations

 import uuid
-from pathlib import Path
 from typing import Optional, Sequence, cast

 from kotaemon.base import BaseComponent, Document, RetrievedDocument
@ -68,37 +67,6 @@ class VectorIndexing(BaseIndexing):
        if self.doc_store:
            self.doc_store.add(input_)

-    def save(
-        self,
-        path: str | Path,
-        vectorstore_fname: str = VECTOR_STORE_FNAME,
-        docstore_fname: str = DOC_STORE_FNAME,
-    ):
-        """Save the whole state of the indexing pipeline vector store and all
-        necessary information to disk
-
-        Args:
-            path (str): path to save the state
-        """
-        if isinstance(path, str):
-            path = Path(path)
-        self.vector_store.save(path / vectorstore_fname)
-        if self.doc_store:
-            self.doc_store.save(path / docstore_fname)
-
-    def load(
-        self,
-        path: str | Path,
-        vectorstore_fname: str = VECTOR_STORE_FNAME,
-        docstore_fname: str = DOC_STORE_FNAME,
-    ):
-        """Load all information from disk to an object"""
-        if isinstance(path, str):
-            path = Path(path)
-        self.vector_store.load(path / vectorstore_fname)
-        if self.doc_store:
-            self.doc_store.load(path / docstore_fname)
-

 class VectorRetrieval(BaseRetrieval):
    """Retrieve list of documents from vector store"""
@ -144,37 +112,6 @@ class VectorRetrieval(BaseRetrieval):

        return result

-    def save(
-        self,
-        path: str | Path,
-        vectorstore_fname: str = VECTOR_STORE_FNAME,
-        docstore_fname: str = DOC_STORE_FNAME,
-    ):
-        """Save the whole state of the indexing pipeline vector store and all
-        necessary information to disk
-
-        Args:
-            path (str): path to save the state
-        """
-        if isinstance(path, str):
-            path = Path(path)
-        self.vector_store.save(path / vectorstore_fname)
-        if self.doc_store:
-            self.doc_store.save(path / docstore_fname)
-
-    def load(
-        self,
-        path: str | Path,
-        vectorstore_fname: str = VECTOR_STORE_FNAME,
-        docstore_fname: str = DOC_STORE_FNAME,
-    ):
-        """Load all information from disk to an object"""
-        if isinstance(path, str):
-            path = Path(path)
-        self.vector_store.load(path / vectorstore_fname)
-        if self.doc_store:
-            self.doc_store.load(path / docstore_fname)
-

 class TextVectorQA(BaseComponent):
    retrieving_pipeline: BaseRetrieval
--- a/knowledgehub/llms/chats/langchain_based.py
+++ b/knowledgehub/llms/chats/langchain_based.py
@ -101,10 +101,13 @@ class LCChatMixin:
            return self._kwargs[name]
        return getattr(self._obj, name)

-    def dump(self):
+    def dump(self, *args, **kwargs):
+        from theflow.utils.modules import serialize
+
+        params = {key: serialize(value) for key, value in self._kwargs.items()}
        return {
            "__type__": f"{self.__module__}.{self.__class__.__qualname__}",
-            **self._kwargs,
+            **params,
        }

    def specs(self, path: str):
--- a/knowledgehub/llms/completions/langchain_based.py
+++ b/knowledgehub/llms/completions/langchain_based.py
@ -78,10 +78,13 @@ class LCCompletionMixin:
            return self._kwargs[name]
        return getattr(self._obj, name)

-    def dump(self):
+    def dump(self, *args, **kwargs):
+        from theflow.utils.modules import serialize
+
+        params = {key: serialize(value) for key, value in self._kwargs.items()}
        return {
            "__type__": f"{self.__module__}.{self.__class__.__qualname__}",
-            **self._kwargs,
+            **params,
        }

    def specs(self, path: str):
--- a/knowledgehub/storages/init.py
+++ b/knowledgehub/storages/init.py
@ -2,16 +2,24 @@ from .docstores import (
    BaseDocumentStore,
    ElasticsearchDocumentStore,
    InMemoryDocumentStore,
+    SimpleFileDocumentStore,
+)
+from .vectorstores import (
+    BaseVectorStore,
+    ChromaVectorStore,
+    InMemoryVectorStore,
+    SimpleFileVectorStore,
 )
-from .vectorstores import BaseVectorStore, ChromaVectorStore, InMemoryVectorStore

 __all__ = [
    # Document stores
    "BaseDocumentStore",
    "InMemoryDocumentStore",
    "ElasticsearchDocumentStore",
+    "SimpleFileDocumentStore",
    # Vector stores
    "BaseVectorStore",
    "ChromaVectorStore",
    "InMemoryVectorStore",
+    "SimpleFileVectorStore",
 ]
--- a/knowledgehub/storages/docstores/init.py
+++ b/knowledgehub/storages/docstores/init.py
@ -1,5 +1,11 @@
 from .base import BaseDocumentStore
 from .elasticsearch import ElasticsearchDocumentStore
 from .in_memory import InMemoryDocumentStore
+from .simple_file import SimpleFileDocumentStore

-__all__ = ["BaseDocumentStore", "InMemoryDocumentStore", "ElasticsearchDocumentStore"]
+__all__ = [
+    "BaseDocumentStore",
+    "InMemoryDocumentStore",
+    "ElasticsearchDocumentStore",
+    "SimpleFileDocumentStore",
+]
--- a/knowledgehub/storages/docstores/base.py
+++ b/knowledgehub/storages/docstores/base.py
@ -1,8 +1,7 @@
 from abc import ABC, abstractmethod
-from pathlib import Path
 from typing import List, Optional, Union

-from ...base import Document
+from kotaemon.base import Document


 class BaseDocumentStore(ABC):
@ -46,13 +45,3 @@ class BaseDocumentStore(ABC):
    def delete(self, ids: Union[List[str], str]):
        """Delete document by id"""
        ...
-
-    @abstractmethod
-    def save(self, path: Union[str, Path]):
-        """Save document to path"""
-        ...
-
-    @abstractmethod
-    def load(self, path: Union[str, Path]):
-        """Load document store from path"""
-        ...
--- a/knowledgehub/storages/docstores/elasticsearch.py
+++ b/knowledgehub/storages/docstores/elasticsearch.py
@ -1,7 +1,7 @@
-from pathlib import Path
 from typing import List, Optional, Union

-from ...base import Document
+from kotaemon.base import Document
+
 from .base import BaseDocumentStore

 MAX_DOCS_TO_GET = 10**4
@ -27,6 +27,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):

        self.elasticsearch_url = elasticsearch_url
        self.index_name = index_name
+        self.k1 = k1
+        self.b = b

        # Create an Elasticsearch client instance
        self.client = Elasticsearch(elasticsearch_url)
@ -160,10 +162,10 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
        self.client.delete_by_query(index=self.index_name, body=query)
        self.client.indices.refresh(index=self.index_name)

-    def save(self, path: Union[str, Path]):
-        """Save document to path"""
-        # not required for ElasticDocstore
-
-    def load(self, path: Union[str, Path]):
-        """Load document store from path"""
-        # not required for ElasticDocstore
+    def __persist_flow__(self):
+        return {
+            "index_name": self.index_name,
+            "elasticsearch_url": self.elasticsearch_url,
+            "k1": self.k1,
+            "b": self.b,
+        }
--- a/knowledgehub/storages/docstores/in_memory.py
+++ b/knowledgehub/storages/docstores/in_memory.py
@ -2,7 +2,8 @@ import json
 from pathlib import Path
 from typing import List, Optional, Union

-from ...base import Document
+from kotaemon.base import Document
+
 from .base import BaseDocumentStore


@ -74,3 +75,6 @@ class InMemoryDocumentStore(BaseDocumentStore):
        with open(path) as f:
            store = json.load(f)
        self._store = {key: Document.from_dict(value) for key, value in store.items()}
+
+    def __persist_flow__(self):
+        return {}
--- a/knowledgehub/storages/docstores/simple_file.py
+++ b/knowledgehub/storages/docstores/simple_file.py
@ -0,0 +1,44 @@
+from pathlib import Path
+from typing import List, Optional, Union
+
+from kotaemon.base import Document
+
+from .in_memory import InMemoryDocumentStore
+
+
+class SimpleFileDocumentStore(InMemoryDocumentStore):
+    """InMemoryDocumentStore that re-saves itself to `path` after each add/delete"""
+
+    def __init__(self, path: Union[str, Path]):
+        # `str | Path` would be evaluated here and raises TypeError before 3.10
+        super().__init__()
+        self._path = path
+        if path is not None and Path(path).is_file():
+            self.load(path)
+
+    def add(
+        self,
+        docs: Union[Document, List[Document]],
+        ids: Optional[Union[List[str], str]] = None,
+        **kwargs,
+    ):
+        """Add documents to the store, then save the store to `self._path`
+
+        Args:
+            docs: a document or a list of documents to add
+            ids: ids of the documents to add (passed on to the parent `add`)
+            **kwargs: passed on to the parent `add`, e.g. `exist_ok`
+        """
+        super().add(docs=docs, ids=ids, **kwargs)
+        self.save(self._path)
+
+    def delete(self, ids: Union[List[str], str]):
+        """Delete documents by id, then save the store to `self._path`"""
+        super().delete(ids=ids)
+        self.save(self._path)
+
+    def __persist_flow__(self):
+        """Return the serialized `path`, the only constructor argument"""
+        from theflow.utils.modules import serialize
+
+        return {"path": serialize(self._path)}
--- a/knowledgehub/storages/vectorstores/init.py
+++ b/knowledgehub/storages/vectorstores/init.py
@ -1,5 +1,11 @@
 from .base import BaseVectorStore
 from .chroma import ChromaVectorStore
 from .in_memory import InMemoryVectorStore
+from .simple_file import SimpleFileVectorStore

-__all__ = ["BaseVectorStore", "ChromaVectorStore", "InMemoryVectorStore"]
+__all__ = [
+    "BaseVectorStore",
+    "ChromaVectorStore",
+    "InMemoryVectorStore",
+    "SimpleFileVectorStore",
+]
--- a/knowledgehub/storages/vectorstores/base.py
+++ b/knowledgehub/storages/vectorstores/base.py
@ -1,12 +1,14 @@
+from __future__ import annotations
+
 from abc import ABC, abstractmethod
-from typing import Any, List, Optional, Tuple, Type, Union
+from typing import Any, Optional

 from llama_index.schema import NodeRelationship, RelatedNodeInfo
 from llama_index.vector_stores.types import BasePydanticVectorStore
 from llama_index.vector_stores.types import VectorStore as LIVectorStore
 from llama_index.vector_stores.types import VectorStoreQuery

-from kotaemon.base import Document, DocumentWithEmbedding
+from kotaemon.base import DocumentWithEmbedding


 class BaseVectorStore(ABC):
@ -17,10 +19,10 @@ class BaseVectorStore(ABC):
    @abstractmethod
    def add(
        self,
-        embeddings: List[List[float]] | List[DocumentWithEmbedding],
-        metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
-    ) -> List[str]:
+        embeddings: list[list[float]] | list[DocumentWithEmbedding],
+        metadatas: Optional[list[dict]] = None,
+        ids: Optional[list[str]] = None,
+    ) -> list[str]:
        """Add vector embeddings to vector stores

        Args:
@ -35,16 +37,7 @@ class BaseVectorStore(ABC):
        ...

    @abstractmethod
-    def add_from_docs(self, docs: List[Document]):
-        """Add vector embeddings to vector stores
-
-        Args:
-            docs: List of Document objects
-        """
-        ...
-
-    @abstractmethod
-    def delete(self, ids: List[str], **kwargs):
+    def delete(self, ids: list[str], **kwargs):
        """Delete vector embeddings from vector stores

        Args:
@ -56,11 +49,11 @@ class BaseVectorStore(ABC):
    @abstractmethod
    def query(
        self,
-        embedding: List[float],
+        embedding: list[float],
        top_k: int = 1,
-        ids: Optional[List[str]] = None,
+        ids: Optional[list[str]] = None,
        **kwargs,
-    ) -> Tuple[List[List[float]], List[float], List[str]]:
+    ) -> tuple[list[list[float]], list[float], list[str]]:
        """Return the top k most similar vector embeddings

        Args:
@ -73,17 +66,9 @@ class BaseVectorStore(ABC):
        """
        ...

-    @abstractmethod
-    def load(self, *args, **kwargs):
-        pass
-
-    @abstractmethod
-    def save(self, *args, **kwargs):
-        pass
-

 class LlamaIndexVectorStore(BaseVectorStore):
-    _li_class: Type[Union[LIVectorStore, BasePydanticVectorStore]]
+    _li_class: type[LIVectorStore | BasePydanticVectorStore]

    def __init__(self, *args, **kwargs):
        if self._li_class is None:
@ -104,12 +89,12 @@ class LlamaIndexVectorStore(BaseVectorStore):

    def add(
        self,
-        embeddings: List[List[float]] | List[DocumentWithEmbedding],
-        metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
+        embeddings: list[list[float]] | list[DocumentWithEmbedding],
+        metadatas: Optional[list[dict]] = None,
+        ids: Optional[list[str]] = None,
    ):
        if isinstance(embeddings[0], list):
-            nodes = [
+            nodes: list[DocumentWithEmbedding] = [
                DocumentWithEmbedding(embedding=embedding) for embedding in embeddings
            ]
        else:
@ -126,20 +111,17 @@ class LlamaIndexVectorStore(BaseVectorStore):

        return self._client.add(nodes=nodes)

-    def add_from_docs(self, docs: List[Document]):
-        return self._client.add(nodes=docs)
-
-    def delete(self, ids: List[str], **kwargs):
+    def delete(self, ids: list[str], **kwargs):
        for id_ in ids:
            self._client.delete(ref_doc_id=id_, **kwargs)

    def query(
        self,
-        embedding: List[float],
+        embedding: list[float],
        top_k: int = 1,
-        ids: Optional[List[str]] = None,
+        ids: Optional[list[str]] = None,
        **kwargs,
-    ) -> Tuple[List[List[float]], List[float], List[str]]:
+    ) -> tuple[list[list[float]], list[float], list[str]]:
        output = self._client.query(
            query=VectorStoreQuery(
                query_embedding=embedding,
--- a/knowledgehub/storages/vectorstores/chroma.py
+++ b/knowledgehub/storages/vectorstores/chroma.py
@ -21,6 +21,17 @@ class ChromaVectorStore(LlamaIndexVectorStore):
        flat_metadata: bool = True,
        **kwargs: Any,
    ):
+        self._path = path
+        self._collection_name = collection_name
+        self._host = host
+        self._port = port
+        self._ssl = ssl
+        self._headers = headers
+        self._collection_kwargs = collection_kwargs
+        self._stores_text = stores_text
+        self._flat_metadata = flat_metadata
+        self._kwargs = kwargs
+
        try:
            import chromadb
        except ImportError:
@ -70,8 +81,16 @@ class ChromaVectorStore(LlamaIndexVectorStore):
    def count(self) -> int:
        return self._collection.count()

-    def save(self, *args, **kwargs):
-        pass
-
-    def load(self, *args, **kwargs):
-        pass
+    def __persist_flow__(self):
+        return {
+            "path": self._path,
+            "collection_name": self._collection_name,
+            "host": self._host,
+            "port": self._port,
+            "ssl": self._ssl,
+            "headers": self._headers,
+            "collection_kwargs": self._collection_kwargs,
+            "stores_text": self._stores_text,
+            "flat_metadata": self._flat_metadata,
+            **self._kwargs,
+        }
--- a/knowledgehub/storages/vectorstores/in_memory.py
+++ b/knowledgehub/storages/vectorstores/in_memory.py
@ -1,5 +1,4 @@
 """Simple vector store index."""
-
 from typing import Any, Optional, Type

 import fsspec
@ -53,3 +52,11 @@ class InMemoryVectorStore(LlamaIndexVectorStore):
            fs: An abstract super-class for pythonic file-systems
        """
        self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)
+
+    def __persist_flow__(self):
+        d = self._data.to_dict()
+        d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}"
+        return {
+            "data": d,
+            # "fs": self._fs,
+        }
--- a/knowledgehub/storages/vectorstores/simple_file.py
+++ b/knowledgehub/storages/vectorstores/simple_file.py
@ -0,0 +1,66 @@
+"""Simple file vector store index."""
+from pathlib import Path
+from typing import Any, Optional, Type
+
+import fsspec
+from llama_index.vector_stores import SimpleVectorStore as LISimpleVectorStore
+from llama_index.vector_stores.simple import SimpleVectorStoreData
+
+from kotaemon.base import DocumentWithEmbedding
+
+from .base import LlamaIndexVectorStore
+
+
+class SimpleFileVectorStore(LlamaIndexVectorStore):
+    """Similar to InMemoryVectorStore but is backed by file by default"""
+
+    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore
+    store_text: bool = False
+
+    def __init__(
+        self,
+        path: "str | Path",
+        data: Optional[SimpleVectorStoreData] = None,
+        fs: Optional[fsspec.AbstractFileSystem] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Create the store, loading it from `path` when that file exists."""
+        # The `|` unions in this class are quoted on purpose: this module has no
+        # `from __future__ import annotations`, so unquoted ones are evaluated at
+        # definition time and raise TypeError on Python versions before 3.10.
+        self._data = data or SimpleVectorStoreData()
+        self._fs = fs or fsspec.filesystem("file")
+        self._path = path
+        self._save_path = Path(path)
+
+        super().__init__(data=data, fs=fs, **kwargs)
+
+        if self._save_path.is_file():
+            self._client = self._li_class.from_persist_path(
+                persist_path=str(self._save_path), fs=self._fs
+            )
+
+    def add(
+        self,
+        embeddings: "list[list[float]] | list[DocumentWithEmbedding]",
+        metadatas: Optional[list[dict]] = None,
+        ids: Optional[list[str]] = None,
+    ):
+        """Add the embeddings, then persist the store to `self._save_path`."""
+        r = super().add(embeddings, metadatas, ids)
+        self._client.persist(str(self._save_path), self._fs)
+        return r
+
+    def delete(self, ids: list[str], **kwargs):
+        """Delete the given ids, then persist the store to `self._save_path`."""
+        r = super().delete(ids, **kwargs)
+        self._client.persist(str(self._save_path), self._fs)
+        return r
+
+    def __persist_flow__(self):
+        """Return the `data` dict (with its `__type__`) and `path`; `fs` is omitted."""
+        # NOTE(review): `self._data` is only assigned in __init__ - confirm it
+        # reflects what `self._client` later adds or loads from the file
+        d = self._data.to_dict()
+        d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}"
+        return {"data": d, "path": str(self._path)}
--- a/tests/test_docstores.py
+++ b/tests/test_docstores.py
@ -1,10 +1,15 @@
+import os
 from unittest.mock import patch

 import pytest
 from elastic_transport import ApiResponseMeta

 from kotaemon.base import Document
-from kotaemon.storages import ElasticsearchDocumentStore, InMemoryDocumentStore
+from kotaemon.storages import (
+    ElasticsearchDocumentStore,
+    InMemoryDocumentStore,
+    SimpleFileDocumentStore,
+)

 meta_success = ApiResponseMeta(
    status=200,
@ -207,7 +212,7 @@ _elastic_search_responses = [
 ]


-def test_simple_document_store_base_interfaces(tmp_path):
+def test_inmemory_document_store_base_interfaces(tmp_path):
    """Test all interfaces of a a document store"""

    store = InMemoryDocumentStore()
@ -260,6 +265,64 @@ def test_simple_document_store_base_interfaces(tmp_path):
    store2.load(tmp_path / "store.json")
    assert len(store2.get_all()) == 17, "Laded document store should have 17 documents"

+    os.remove(tmp_path / "store.json")
+
+
+def test_simplefile_document_store_base_interfaces(tmp_path):
+    """Test all interfaces of a file-backed document store"""
+
+    path = tmp_path / "store.json"
+
+    store = SimpleFileDocumentStore(path=path)
+    docs = [
+        Document(text=f"Sample text {idx}", meta={"meta_key": f"meta_value_{idx}"})
+        for idx in range(10)
+    ]
+
+    # Test add and get all
+    assert len(store.get_all()) == 0, "Document store should be empty"
+    store.add(docs)
+    assert len(store.get_all()) == 10, "Document store should have 10 documents"
+
+    # Test add with provided ids
+    store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)])
+    assert len(store.get_all()) == 20, "Document store should have 20 documents"
+
+    # Test add without exist_ok
+    with pytest.raises(ValueError):
+        store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)])
+
+    # Update ok with add exist_ok
+    store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)], exist_ok=True)
+    assert len(store.get_all()) == 20, "Document store should have 20 documents"
+
+    # Test get with str id
+    matched = store.get(docs[0].doc_id)
+    assert len(matched) == 1, "Should return 1 document"
+    assert matched[0].text == docs[0].text, "Should return the correct document"
+
+    # Test get with list of ids
+    matched = store.get([docs[0].doc_id, docs[1].doc_id])
+    assert len(matched) == 2, "Should return 2 documents"
+    assert [doc.text for doc in matched] == [doc.text for doc in docs[:2]]
+
+    # Test delete with str id
+    store.delete(docs[0].doc_id)
+    assert len(store.get_all()) == 19, "Document store should have 19 documents"
+
+    # Test delete with list of ids
+    store.delete([docs[1].doc_id, docs[2].doc_id])
+    assert len(store.get_all()) == 17, "Document store should have 17 documents"
+
+    # Test save: the add/delete calls above should already have written the file
+    assert path.exists(), "File should exist"
+
+    # Test load: a new store on the same path picks up the saved documents
+    store2 = SimpleFileDocumentStore(path=path)
+    assert len(store2.get_all()) == 17, "Loaded document store should have 17 documents"
+
+    os.remove(path)
+

@patch(
    "elastic_transport.Transport.perform_request",
--- a/tests/test_vectorstore.py
+++ b/tests/test_vectorstore.py
@ -1,7 +1,12 @@
 import json
+import os

-from kotaemon.base import Document
-from kotaemon.storages import ChromaVectorStore, InMemoryVectorStore
+from kotaemon.base import DocumentWithEmbedding
+from kotaemon.storages import (
+    ChromaVectorStore,
+    InMemoryVectorStore,
+    SimpleFileVectorStore,
+)


 class TestChromaVectorStore:
@ -24,11 +29,11 @@ class TestChromaVectorStore:
        embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
        metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
        documents = [
-            Document(embedding=embedding, metadata=metadata)
+            DocumentWithEmbedding(embedding=embedding, metadata=metadata)
            for embedding, metadata in zip(embeddings, metadatas)
        ]
        assert db._collection.count() == 0, "Expected empty collection"
-        output = db.add_from_docs(documents)
+        output = db.add(documents)
        assert len(output) == 2, "Expected outputing 2 ids"
        assert db._collection.count() == 2, "Expected 2 added entries"

@ -69,10 +74,8 @@ class TestChromaVectorStore:
        ids = ["1", "2", "3"]
        db = ChromaVectorStore(path=str(tmp_path))
        db.add(embeddings=embeddings, metadatas=metadatas, ids=ids)
-        db.save()

        db2 = ChromaVectorStore(path=str(tmp_path))
-        db2.load()
        assert (
            db2._collection.count() == 3
        ), "load function does not load data completely"
@ -122,3 +125,30 @@ class TestInMemoryVectorStore:
            0.5,
            0.6,
        ], "load function does not load data completely"
+
+
+class TestSimpleFileVectorStore:
+    def test_add_delete(self, tmp_path):
+        """Check that add/delete are saved to the file and can be loaded back."""
+        embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]
+        metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}]
+        ids = ["1", "2", "3"]
+        db = SimpleFileVectorStore(path=tmp_path / "test_save_load_delete.json")
+        db.add(embeddings=embeddings, metadatas=metadatas, ids=ids)
+        db.delete(["3"])
+        with open(tmp_path / "test_save_load_delete.json") as f:
+            data = json.load(f)
+        # check each id separately: `"1" and "2" in x` only tests "2"
+        assert all(
+            id_ in data["text_id_to_ref_doc_id"] for id_ in ("1", "2")
+        ), "save function does not save data completely"
+        assert (
+            "3" not in data["text_id_to_ref_doc_id"]
+        ), "delete function does not delete data completely"
+
+        # a fresh store on the same path must load what was persisted
+        db2 = SimpleFileVectorStore(path=tmp_path / "test_save_load_delete.json")
+        loaded = db2.get("2")
+        assert loaded == [0.4, 0.5, 0.6], "load function does not load data completely"
+
+        os.remove(tmp_path / "test_save_load_delete.json")