* Modify docstore and vectorstore objects to be reconstructable * Simplify the file docstore * Use the simple file docstore and vector store in MVP
142 lines
4.3 KiB
Python
142 lines
4.3 KiB
Python
from __future__ import annotations
|
|
|
|
from abc import ABC, abstractmethod
|
|
from typing import Any, Optional
|
|
|
|
from llama_index.schema import NodeRelationship, RelatedNodeInfo
|
|
from llama_index.vector_stores.types import BasePydanticVectorStore
|
|
from llama_index.vector_stores.types import VectorStore as LIVectorStore
|
|
from llama_index.vector_stores.types import VectorStoreQuery
|
|
|
|
from kotaemon.base import DocumentWithEmbedding
|
|
|
|
|
|
class BaseVectorStore(ABC):
|
|
@abstractmethod
|
|
def __init__(self, *args, **kwargs):
|
|
...
|
|
|
|
@abstractmethod
|
|
def add(
|
|
self,
|
|
embeddings: list[list[float]] | list[DocumentWithEmbedding],
|
|
metadatas: Optional[list[dict]] = None,
|
|
ids: Optional[list[str]] = None,
|
|
) -> list[str]:
|
|
"""Add vector embeddings to vector stores
|
|
|
|
Args:
|
|
embeddings: List of embeddings
|
|
metadatas: List of metadata of the embeddings
|
|
ids: List of ids of the embeddings
|
|
kwargs: meant for vectorstore-specific parameters
|
|
|
|
Returns:
|
|
List of ids of the embeddings
|
|
"""
|
|
...
|
|
|
|
@abstractmethod
|
|
def delete(self, ids: list[str], **kwargs):
|
|
"""Delete vector embeddings from vector stores
|
|
|
|
Args:
|
|
ids: List of ids of the embeddings to be deleted
|
|
kwargs: meant for vectorstore-specific parameters
|
|
"""
|
|
...
|
|
|
|
@abstractmethod
|
|
def query(
|
|
self,
|
|
embedding: list[float],
|
|
top_k: int = 1,
|
|
ids: Optional[list[str]] = None,
|
|
**kwargs,
|
|
) -> tuple[list[list[float]], list[float], list[str]]:
|
|
"""Return the top k most similar vector embeddings
|
|
|
|
Args:
|
|
embedding: List of embeddings
|
|
top_k: Number of most similar embeddings to return
|
|
ids: List of ids of the embeddings to be queried
|
|
|
|
Returns:
|
|
the matched embeddings, the similarity scores, and the ids
|
|
"""
|
|
...
|
|
|
|
|
|
class LlamaIndexVectorStore(BaseVectorStore):
|
|
_li_class: type[LIVectorStore | BasePydanticVectorStore]
|
|
|
|
def __init__(self, *args, **kwargs):
|
|
if self._li_class is None:
|
|
raise AttributeError(
|
|
"Require `_li_class` to set a VectorStore class from LlamarIndex"
|
|
)
|
|
|
|
self._client = self._li_class(*args, **kwargs)
|
|
|
|
def __setattr__(self, name: str, value: Any) -> None:
|
|
if name.startswith("_"):
|
|
return super().__setattr__(name, value)
|
|
|
|
return setattr(self._client, name, value)
|
|
|
|
def __getattr__(self, name: str) -> Any:
|
|
return getattr(self._client, name)
|
|
|
|
def add(
|
|
self,
|
|
embeddings: list[list[float]] | list[DocumentWithEmbedding],
|
|
metadatas: Optional[list[dict]] = None,
|
|
ids: Optional[list[str]] = None,
|
|
):
|
|
if isinstance(embeddings[0], list):
|
|
nodes: list[DocumentWithEmbedding] = [
|
|
DocumentWithEmbedding(embedding=embedding) for embedding in embeddings
|
|
]
|
|
else:
|
|
nodes = embeddings # type: ignore
|
|
if metadatas is not None:
|
|
for node, metadata in zip(nodes, metadatas):
|
|
node.metadata = metadata
|
|
if ids is not None:
|
|
for node, id in zip(nodes, ids):
|
|
node.id_ = id
|
|
node.relationships = {
|
|
NodeRelationship.SOURCE: RelatedNodeInfo(node_id=id)
|
|
}
|
|
|
|
return self._client.add(nodes=nodes)
|
|
|
|
def delete(self, ids: list[str], **kwargs):
|
|
for id_ in ids:
|
|
self._client.delete(ref_doc_id=id_, **kwargs)
|
|
|
|
def query(
|
|
self,
|
|
embedding: list[float],
|
|
top_k: int = 1,
|
|
ids: Optional[list[str]] = None,
|
|
**kwargs,
|
|
) -> tuple[list[list[float]], list[float], list[str]]:
|
|
output = self._client.query(
|
|
query=VectorStoreQuery(
|
|
query_embedding=embedding,
|
|
similarity_top_k=top_k,
|
|
node_ids=ids,
|
|
**kwargs,
|
|
),
|
|
)
|
|
|
|
embeddings = []
|
|
if output.nodes:
|
|
for node in output.nodes:
|
|
embeddings.append(node.embedding)
|
|
similarities = output.similarities if output.similarities else []
|
|
out_ids = output.ids if output.ids else []
|
|
|
|
return embeddings, similarities, out_ids
|