[AUR-362] Add In-memory vector store (#22)

* [AUR-362] Add In-memory vector store

* [AUR-362] fix delete fun input format

* [AUR-362] revise persist and from persist path to save and load

* [AUR-362] revise simple.py to in_memory.py
This commit is contained in:
cin-jacky 2023-09-20 17:51:50 +09:00 committed by GitHub
parent b794051653
commit c329c4c03f
4 changed files with 104 additions and 2 deletions

View File

@ -1,4 +1,5 @@
from .base import BaseVectorStore
from .chroma import ChromaVectorStore
from .in_memory import InMemoryVectorStore
__all__ = ["BaseVectorStore", "ChromaVectorStore"]
__all__ = ["BaseVectorStore", "ChromaVectorStore", "InMemoryVectorStore"]

View File

@ -1,6 +1,7 @@
from abc import ABC, abstractmethod
from typing import Any, List, Optional, Tuple, Type, Union
from llama_index.schema import NodeRelationship, RelatedNodeInfo
from llama_index.vector_stores.types import BasePydanticVectorStore
from llama_index.vector_stores.types import VectorStore as LIVectorStore
from llama_index.vector_stores.types import VectorStoreQuery
@ -118,6 +119,9 @@ class LlamaIndexVectorStore(BaseVectorStore):
if ids is not None:
for node, id in zip(nodes, ids):
node.id_ = id
node.relationships = {
NodeRelationship.SOURCE: RelatedNodeInfo(node_id=id)
}
return self._client.add(nodes=nodes) # type: ignore

View File

@ -0,0 +1,57 @@
"""Simple vector store index."""
from typing import Any, Optional, Type
import fsspec
from llama_index.vector_stores import SimpleVectorStore as LISimpleVectorStore
from llama_index.vector_stores.simple import SimpleVectorStoreData
from kotaemon.vectorstores.base import LlamaIndexVectorStore
class InMemoryVectorStore(LlamaIndexVectorStore):
_li_class: Type[LISimpleVectorStore] = LISimpleVectorStore
store_text: bool = False
def __init__(
self,
data: Optional[SimpleVectorStoreData] = None,
fs: Optional[fsspec.AbstractFileSystem] = None,
**kwargs: Any,
) -> None:
"""Initialize params."""
self._data = data or SimpleVectorStoreData()
self._fs = fs or fsspec.filesystem("file")
super().__init__(
data=data,
fs=fs,
**kwargs,
)
def save(
self,
save_path: str,
fs: Optional[fsspec.AbstractFileSystem] = None,
**kwargs,
):
"""save a simpleVectorStore to a dictionary.
Args:
save_path: Path of saving vector to disk.
fs: An abstract super-class for pythonic file-systems
"""
self._client.persist(persist_path=save_path, fs=fs)
def load(
self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
) -> "InMemoryVectorStore":
"""Create a SimpleKVStore from a load directory.
Args:
load_path: Path of loading vector.
fs: An abstract super-class for pythonic file-systems
"""
return self._client.from_persist_path(persist_path=load_path, fs=fs)

View File

@ -1,5 +1,7 @@
import json
from kotaemon.documents.base import Document
from kotaemon.vectorstores import ChromaVectorStore
from kotaemon.vectorstores import ChromaVectorStore, InMemoryVectorStore
class TestChromaVectorStore:
@ -59,3 +61,41 @@ class TestChromaVectorStore:
_, _, out_ids = db.query(embedding=[0.42, 0.52, 0.53], top_k=1)
assert out_ids == ["b"]
class TestSimpleVectorStore:
def test_add(self):
"""Test that add func adds correctly."""
embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
ids = ["1", "2"]
db = InMemoryVectorStore()
output = db.add(embeddings=embeddings, metadatas=metadatas, ids=ids)
assert output == ids, "Excepted output to be the same as ids"
def test_save_load_delete(self, tmp_path):
"""Test that delete func deletes correctly."""
embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]
metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}]
ids = ["1", "2", "3"]
db = InMemoryVectorStore()
db.add(embeddings=embeddings, metadatas=metadatas, ids=ids)
db.delete(["3"])
db.save(save_path=tmp_path / "test_save_load_delete.json")
f = open(tmp_path / "test_save_load_delete.json")
data = json.load(f)
assert (
"1" and "2" in data["text_id_to_ref_doc_id"]
), "persist function does not save data completely"
assert (
"3" not in data["text_id_to_ref_doc_id"]
), "delete function does not delete data completely"
db2 = InMemoryVectorStore()
output = db2.load(load_path=tmp_path / "test_save_load_delete.json")
assert output.get("2") == [
0.4,
0.5,
0.6,
], "persist function does not load data completely"