Combine docstores and vectorstores within a storages component (#72)
This commit is contained in:
committed by
GitHub
parent
640962e916
commit
b159897ac6
12
knowledgehub/storages/__init__.py
Normal file
12
knowledgehub/storages/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from .docstores import BaseDocumentStore, InMemoryDocumentStore
|
||||
from .vectorstores import BaseVectorStore, ChromaVectorStore, InMemoryVectorStore
|
||||
|
||||
# Public interface of the storages package: every name below is imported at the
# top of this module from the docstores / vectorstores subpackages.
__all__ = [
    # Document stores
    "BaseDocumentStore",
    "InMemoryDocumentStore",
    # Vector stores
    "BaseVectorStore",
    "ChromaVectorStore",
    "InMemoryVectorStore",
]
|
4
knowledgehub/storages/docstores/__init__.py
Normal file
4
knowledgehub/storages/docstores/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .base import BaseDocumentStore
|
||||
from .in_memory import InMemoryDocumentStore
|
||||
|
||||
# Names re-exported as the public interface of the docstores subpackage.
__all__ = [
    "BaseDocumentStore",
    "InMemoryDocumentStore",
]
|
54
knowledgehub/storages/docstores/base.py
Normal file
54
knowledgehub/storages/docstores/base.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...base import Document
|
||||
|
||||
|
||||
class BaseDocumentStore(ABC):
    """Abstract interface of a component that stores and manages documents.

    Every method, including ``__init__``, is declared abstract, so a concrete
    store has to implement all of them before it can be instantiated.
    """

    @abstractmethod
    def __init__(self, *args, **kwargs):
        """Set up the store; accepted arguments are defined by the subclass."""
        ...

    @abstractmethod
    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        exist_ok: bool = False,
    ):
        """Add one or several documents to the store.

        Args:
            docs: A single document or a list of documents.
            ids: Id (or list of ids) to store the documents under. Optional;
                when not set, ``doc.doc_id`` of each document is used.
            exist_ok: If True, do not raise an error when a document already
                exists.
        """
        ...

    @abstractmethod
    def get(self, ids: Union[List[str], str]) -> List[Document]:
        """Return the documents identified by ``ids`` (one id or a list)."""
        ...

    @abstractmethod
    def get_all(self) -> dict:
        """Return all documents held by the store as a dictionary."""
        ...

    @abstractmethod
    def delete(self, ids: Union[List[str], str]):
        """Remove the documents identified by ``ids`` (one id or a list)."""
        ...

    @abstractmethod
    def save(self, path: Union[str, Path]):
        """Save the document store to ``path``."""
        ...

    @abstractmethod
    def load(self, path: Union[str, Path]):
        """Load the document store from ``path``."""
        ...
|
68
knowledgehub/storages/docstores/in_memory.py
Normal file
68
knowledgehub/storages/docstores/in_memory.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...base import Document
|
||||
from .base import BaseDocumentStore
|
||||
|
||||
|
||||
class InMemoryDocumentStore(BaseDocumentStore):
    """Simple in-memory document store that keeps documents in a dictionary."""

    def __init__(self):
        # Maps document id -> Document.
        self._store = {}

    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        exist_ok: bool = False,
    ):
        """Add one or several documents to the store.

        Args:
            docs: A single document or a list of documents.
            ids: Id (or list of ids) to store the documents under. When empty
                or not given, ``doc.doc_id`` of each document is used.
            exist_ok: If True, silently overwrite a document whose id is
                already present instead of raising.

        Raises:
            ValueError: If an id is already stored and ``exist_ok`` is False.
                Documents processed before the offending one stay stored.
        """
        # Normalise both arguments to lists BEFORE deriving the ids: deriving
        # them first would iterate over a bare Document when a single document
        # is passed without explicit ids.
        if not isinstance(docs, list):
            docs = [docs]
        if ids and not isinstance(ids, list):
            ids = [ids]

        doc_ids = ids if ids else [doc.doc_id for doc in docs]

        for doc_id, doc in zip(doc_ids, docs):
            if doc_id in self._store and not exist_ok:
                raise ValueError(f"Document with id {doc_id} already exist")
            self._store[doc_id] = doc

    def get(self, ids: Union[List[str], str]) -> List[Document]:
        """Return the documents stored under ``ids``, in the order requested.

        Args:
            ids: A single id or a list of ids.

        Raises:
            KeyError: If any of the ids is not in the store.
        """
        if not isinstance(ids, list):
            ids = [ids]

        return [self._store[doc_id] for doc_id in ids]

    def get_all(self) -> dict:
        """Return the internal ``id -> document`` dictionary (not a copy)."""
        return self._store

    def delete(self, ids: Union[List[str], str]):
        """Remove the documents stored under ``ids``.

        Args:
            ids: A single id or a list of ids.

        Raises:
            KeyError: If an id is not in the store; ids listed before it have
                already been removed at that point.
        """
        if not isinstance(ids, list):
            ids = [ids]

        for doc_id in ids:
            del self._store[doc_id]

    def save(self, path: Union[str, Path]):
        """Write the store to ``path`` as JSON.

        Each document is serialised with its ``to_dict()`` method and stored
        under its id.
        """
        store = {key: value.to_dict() for key, value in self._store.items()}
        with open(path, "w") as f:
            json.dump(store, f)

    def load(self, path: Union[str, Path]):
        """Replace the store's content with the JSON file at ``path``.

        Each entry is turned back into a document with ``Document.from_dict``.
        """
        with open(path) as f:
            store = json.load(f)
        self._store = {key: Document.from_dict(value) for key, value in store.items()}
|
5
knowledgehub/storages/vectorstores/__init__.py
Normal file
5
knowledgehub/storages/vectorstores/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from .base import BaseVectorStore
|
||||
from .chroma import ChromaVectorStore
|
||||
from .in_memory import InMemoryVectorStore
|
||||
|
||||
# Names re-exported as the public interface of the vectorstores subpackage.
__all__ = [
    "BaseVectorStore",
    "ChromaVectorStore",
    "InMemoryVectorStore",
]
|
154
knowledgehub/storages/vectorstores/base.py
Normal file
154
knowledgehub/storages/vectorstores/base.py
Normal file
@@ -0,0 +1,154 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, List, Optional, Tuple, Type, Union
|
||||
|
||||
from llama_index.schema import NodeRelationship, RelatedNodeInfo
|
||||
from llama_index.vector_stores.types import BasePydanticVectorStore
|
||||
from llama_index.vector_stores.types import VectorStore as LIVectorStore
|
||||
from llama_index.vector_stores.types import VectorStoreQuery
|
||||
|
||||
from ...base import Document
|
||||
|
||||
|
||||
class BaseVectorStore(ABC):
    """Abstract interface of a store for vector embeddings.

    Every method, including ``__init__``, is declared abstract, so a concrete
    store has to implement all of them before it can be instantiated.
    """

    @abstractmethod
    def __init__(self, *args, **kwargs):
        """Set up the store; accepted arguments are defined by the subclass."""
        ...

    @abstractmethod
    def add(
        self,
        embeddings: List[List[float]],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
    ) -> List[str]:
        """Add vector embeddings to the vector store.

        Args:
            embeddings: List of embeddings.
            metadatas: List of metadata of the embeddings.
            ids: List of ids of the embeddings.

        Returns:
            List of ids of the embeddings.
        """
        ...

    @abstractmethod
    def add_from_docs(self, docs: List[Document]):
        """Add vector embeddings to the vector store from documents.

        Args:
            docs: List of Document objects.
        """
        ...

    @abstractmethod
    def delete(self, ids: List[str], **kwargs):
        """Delete vector embeddings from the vector store.

        Args:
            ids: List of ids of the embeddings to be deleted.
            kwargs: Meant for vectorstore-specific parameters.
        """
        ...

    @abstractmethod
    def query(
        self,
        embedding: List[float],
        top_k: int = 1,
        ids: Optional[List[str]] = None,
        **kwargs,
    ) -> Tuple[List[List[float]], List[float], List[str]]:
        """Return the top k most similar vector embeddings.

        Args:
            embedding: The query embedding.
            top_k: Number of most similar embeddings to return.
            ids: List of ids of the embeddings to be queried.
            kwargs: Additional, implementation-defined parameters.

        Returns:
            The matched embeddings, the similarity scores, and the ids.
        """
        ...

    @abstractmethod
    def load(self, *args, **kwargs):
        """Load the store; arguments are defined by the subclass."""
        pass

    @abstractmethod
    def save(self, *args, **kwargs):
        """Save the store; arguments are defined by the subclass."""
        pass
|
||||
|
||||
|
||||
class LlamaIndexVectorStore(BaseVectorStore):
    """Adapter exposing a LlamaIndex vector store through ``BaseVectorStore``.

    A subclass sets ``_li_class`` to the LlamaIndex class to wrap. An instance
    of that class is created in ``__init__`` and kept in ``_client``. Attribute
    reads that are not found on the wrapper, and writes of names that do not
    start with an underscore, are delegated to ``_client``.
    """

    # Defaults to None so that __init__ can detect a subclass that forgot to
    # set it. Without a default the lookup falls through to __getattr__.
    _li_class: Optional[Type[Union[LIVectorStore, BasePydanticVectorStore]]] = None

    def __init__(self, *args, **kwargs):
        """Instantiate the wrapped LlamaIndex store.

        Args:
            *args: Positional arguments forwarded to ``_li_class``.
            **kwargs: Keyword arguments forwarded to ``_li_class``.

        Raises:
            AttributeError: If the subclass did not set ``_li_class``.
        """
        if self._li_class is None:
            raise AttributeError(
                "Require `_li_class` to set a VectorStore class from LlamarIndex"
            )

        self._client = self._li_class(*args, **kwargs)

    def __setattr__(self, name: str, value: Any) -> None:
        """Store underscore-prefixed names on the wrapper, others on the client."""
        if name.startswith("_"):
            return super().__setattr__(name, value)

        return setattr(self._client, name, value)

    def __getattr__(self, name: str) -> Any:
        """Delegate attributes missing on the wrapper to the wrapped client.

        Raises:
            AttributeError: If ``_client`` itself is requested but not set.
        """
        # __getattr__ only runs after normal lookup has failed. If `_client`
        # is the missing name, delegating would read `self._client` again and
        # recurse until RecursionError, so fail explicitly instead.
        if name == "_client":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )

        return getattr(self._client, name)

    def add(
        self,
        embeddings: List[List[float]],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
    ) -> List[str]:
        """Add vector embeddings to the wrapped store.

        Each embedding is wrapped in a ``Document`` node before being handed
        to the client.

        Args:
            embeddings: List of embeddings.
            metadatas: Metadata assigned to the nodes, paired by position.
            ids: Ids assigned to the nodes, paired by position.

        Returns:
            Whatever the wrapped client's ``add`` returns (the base interface
            declares a list of ids).
        """
        nodes = [Document(embedding=embedding) for embedding in embeddings]

        if metadatas is not None:
            for node, metadata in zip(nodes, metadatas):
                node.metadata = metadata

        if ids is not None:
            for node, node_id in zip(nodes, ids):
                node.id_ = node_id
                # The node is registered as its own source; presumably this is
                # what lets delete() address it via `ref_doc_id` - confirm.
                node.relationships = {
                    NodeRelationship.SOURCE: RelatedNodeInfo(node_id=node_id)
                }

        return self._client.add(nodes=nodes)  # type: ignore

    def add_from_docs(self, docs: List[Document]):
        """Add documents to the wrapped store, passing them as nodes.

        Args:
            docs: List of Document objects.
        """
        return self._client.add(nodes=docs)  # type: ignore

    def delete(self, ids: List[str], **kwargs):
        """Delete entries from the wrapped store, one client call per id.

        Args:
            ids: Ids passed to the client as ``ref_doc_id``.
            kwargs: Forwarded unchanged to every client ``delete`` call.
        """
        for id_ in ids:
            self._client.delete(ref_doc_id=id_, **kwargs)

    def query(
        self,
        embedding: List[float],
        top_k: int = 1,
        ids: Optional[List[str]] = None,
        **kwargs,
    ) -> Tuple[List[List[float]], List[float], List[str]]:
        """Return the top k most similar vector embeddings.

        Args:
            embedding: The query embedding.
            top_k: Number of most similar embeddings to return.
            ids: Ids passed to the query as ``node_ids``.
            kwargs: Extra fields forwarded to ``VectorStoreQuery``.

        Returns:
            The matched embeddings, the similarity scores, and the ids. Each
            part is an empty list when the client result has none.
        """
        output = self._client.query(
            query=VectorStoreQuery(
                query_embedding=embedding,
                similarity_top_k=top_k,
                node_ids=ids,
                **kwargs,
            ),
        )

        embeddings = (
            [node.embedding for node in output.nodes] if output.nodes else []
        )
        similarities = output.similarities if output.similarities else []
        out_ids = output.ids if output.ids else []

        return embeddings, similarities, out_ids
|
77
knowledgehub/storages/vectorstores/chroma.py
Normal file
77
knowledgehub/storages/vectorstores/chroma.py
Normal file
@@ -0,0 +1,77 @@
|
||||
from typing import Any, Dict, List, Optional, Type, cast
|
||||
|
||||
from llama_index.vector_stores.chroma import ChromaVectorStore as LIChromaVectorStore
|
||||
|
||||
from .base import LlamaIndexVectorStore
|
||||
|
||||
|
||||
class ChromaVectorStore(LlamaIndexVectorStore):
    """Vector store wrapping LlamaIndex's Chroma vector store."""

    _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore

    def __init__(
        self,
        path: str = "./chroma",
        collection_name: str = "default",
        host: str = "localhost",
        port: str = "8000",
        ssl: bool = False,
        headers: Optional[Dict[str, str]] = None,
        collection_kwargs: Optional[dict] = None,
        stores_text: bool = True,
        flat_metadata: bool = True,
        **kwargs: Any,
    ):
        """Open (or create) a Chroma collection and wrap it.

        Args:
            path: Directory handed to ``chromadb.PersistentClient``.
            collection_name: Collection fetched with
                ``get_or_create_collection``.
            host, port, ssl, headers, collection_kwargs, stores_text,
                flat_metadata: Passed through to the LlamaIndex Chroma store;
                ``headers`` and ``collection_kwargs`` become ``{}`` when empty
                or None.
            **kwargs: Extra keyword arguments for the LlamaIndex Chroma store.

        Raises:
            ImportError: If ``chromadb`` is not installed.
        """
        try:
            import chromadb
        except ImportError:
            raise ImportError(
                "ChromaVectorStore requires chromadb. "
                "Please install chromadb first `pip install chromadb`"
            )

        chroma_client = chromadb.PersistentClient(path=path)
        chroma_collection = chroma_client.get_or_create_collection(collection_name)

        # Arguments are spelled out (rather than hidden in **kwargs) for nice
        # IDE support.
        super().__init__(
            chroma_collection=chroma_collection,
            host=host,
            port=port,
            ssl=ssl,
            headers=headers if headers else {},
            collection_kwargs=collection_kwargs if collection_kwargs else {},
            stores_text=stores_text,
            flat_metadata=flat_metadata,
            **kwargs,
        )
        # No runtime effect: only narrows the type seen by type checkers.
        self._client = cast(LIChromaVectorStore, self._client)

    def delete(self, ids: List[str], **kwargs):
        """Delete vector embeddings from the vector store.

        Args:
            ids: List of ids of the embeddings to be deleted.
            kwargs: Accepted for interface compatibility; not used here.
        """
        self._client._collection.delete(ids=ids)

    def delete_collection(self, collection_name: Optional[str] = None):
        """Delete an entire collection from the vector store.

        Args:
            collection_name: Name of the collection to delete. When None, the
                name of the wrapped store's own collection is used.
        """
        # A rather ugly attribute chain, but it reaches the original chromadb
        # client, which is the object that offers delete_collection().
        li_collection = self._client.client
        if collection_name is None:
            collection_name = li_collection.name
        li_collection._client.delete_collection(collection_name)

    def count(self) -> int:
        """Return the result of ``count()`` on the collection."""
        # `_collection` is never assigned on this wrapper in the visible code;
        # presumably it resolves through the parent's attribute delegation to
        # the wrapped client - confirm.
        return self._collection.count()

    def save(self, *args, **kwargs):
        """Do nothing; all arguments are ignored."""
        pass

    def load(self, *args, **kwargs):
        """Do nothing; all arguments are ignored."""
        pass
|
55
knowledgehub/storages/vectorstores/in_memory.py
Normal file
55
knowledgehub/storages/vectorstores/in_memory.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Simple vector store index."""
|
||||
|
||||
from typing import Any, Optional, Type
|
||||
|
||||
import fsspec
|
||||
from llama_index.vector_stores import SimpleVectorStore as LISimpleVectorStore
|
||||
from llama_index.vector_stores.simple import SimpleVectorStoreData
|
||||
|
||||
from .base import LlamaIndexVectorStore
|
||||
|
||||
|
||||
class InMemoryVectorStore(LlamaIndexVectorStore):
    """Vector store wrapping LlamaIndex's ``SimpleVectorStore``."""

    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore
    store_text: bool = False

    def __init__(
        self,
        data: Optional[SimpleVectorStoreData] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the store.

        Args:
            data: Initial store data, forwarded to the wrapped store as given.
            fs: File system, forwarded to the wrapped store as given.
            **kwargs: Extra keyword arguments for the wrapped store.
        """
        # Kept on the wrapper itself, with defaults filled in; the wrapped
        # store below still receives the caller's original values.
        self._data = data if data else SimpleVectorStoreData()
        self._fs = fs if fs else fsspec.filesystem("file")

        super().__init__(data=data, fs=fs, **kwargs)

    def save(
        self,
        save_path: str,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        **kwargs,
    ):
        """Persist the wrapped vector store.

        Args:
            save_path: Path passed to the wrapped store as ``persist_path``.
            fs: An abstract super-class for pythonic file-systems.
            kwargs: Accepted but not used.
        """
        self._client.persist(persist_path=save_path, fs=fs)

    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):
        """Replace the wrapped vector store with one loaded from a path.

        Args:
            load_path: Path passed to ``from_persist_path`` as ``persist_path``.
            fs: An abstract super-class for pythonic file-systems.
        """
        restored = self._client.from_persist_path(persist_path=load_path, fs=fs)
        self._client = restored
|
Reference in New Issue
Block a user