Add file-based document store and vector store (#96)
* Modify docstore and vectorstore objects to be reconstructable
* Simplify the file docstore
* Use the simple file docstore and vector store in MVP
This commit is contained in:
committed by
GitHub
parent
0ce3a8832f
commit
37c744b616
@@ -2,16 +2,24 @@ from .docstores import (
|
||||
BaseDocumentStore,
|
||||
ElasticsearchDocumentStore,
|
||||
InMemoryDocumentStore,
|
||||
SimpleFileDocumentStore,
|
||||
)
|
||||
from .vectorstores import (
|
||||
BaseVectorStore,
|
||||
ChromaVectorStore,
|
||||
InMemoryVectorStore,
|
||||
SimpleFileVectorStore,
|
||||
)
|
||||
from .vectorstores import BaseVectorStore, ChromaVectorStore, InMemoryVectorStore
|
||||
|
||||
__all__ = [
|
||||
# Document stores
|
||||
"BaseDocumentStore",
|
||||
"InMemoryDocumentStore",
|
||||
"ElasticsearchDocumentStore",
|
||||
"SimpleFileDocumentStore",
|
||||
# Vector stores
|
||||
"BaseVectorStore",
|
||||
"ChromaVectorStore",
|
||||
"InMemoryVectorStore",
|
||||
"SimpleFileVectorStore",
|
||||
]
|
||||
|
@@ -1,5 +1,11 @@
|
||||
from .base import BaseDocumentStore
|
||||
from .elasticsearch import ElasticsearchDocumentStore
|
||||
from .in_memory import InMemoryDocumentStore
|
||||
from .simple_file import SimpleFileDocumentStore
|
||||
|
||||
__all__ = ["BaseDocumentStore", "InMemoryDocumentStore", "ElasticsearchDocumentStore"]
|
||||
__all__ = [
|
||||
"BaseDocumentStore",
|
||||
"InMemoryDocumentStore",
|
||||
"ElasticsearchDocumentStore",
|
||||
"SimpleFileDocumentStore",
|
||||
]
|
||||
|
@@ -1,8 +1,7 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...base import Document
|
||||
from kotaemon.base import Document
|
||||
|
||||
|
||||
class BaseDocumentStore(ABC):
|
||||
@@ -46,13 +45,3 @@ class BaseDocumentStore(ABC):
|
||||
def delete(self, ids: Union[List[str], str]):
|
||||
"""Delete document by id"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def save(self, path: Union[str, Path]):
|
||||
"""Save document to path"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def load(self, path: Union[str, Path]):
|
||||
"""Load document store from path"""
|
||||
...
|
||||
|
@@ -1,7 +1,7 @@
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...base import Document
|
||||
from kotaemon.base import Document
|
||||
|
||||
from .base import BaseDocumentStore
|
||||
|
||||
MAX_DOCS_TO_GET = 10**4
|
||||
@@ -27,6 +27,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
|
||||
self.elasticsearch_url = elasticsearch_url
|
||||
self.index_name = index_name
|
||||
self.k1 = k1
|
||||
self.b = b
|
||||
|
||||
# Create an Elasticsearch client instance
|
||||
self.client = Elasticsearch(elasticsearch_url)
|
||||
@@ -160,10 +162,10 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
self.client.delete_by_query(index=self.index_name, body=query)
|
||||
self.client.indices.refresh(index=self.index_name)
|
||||
|
||||
def save(self, path: Union[str, Path]):
|
||||
"""Save document to path"""
|
||||
# not required for ElasticDocstore
|
||||
|
||||
def load(self, path: Union[str, Path]):
|
||||
"""Load document store from path"""
|
||||
# not required for ElasticDocstore
|
||||
def __persist_flow__(self):
    """Return the settings stored on this instance as a plain dict.

    The dict carries ``index_name``, ``elasticsearch_url``, ``k1`` and ``b``,
    read from the attributes of the same name.
    """
    field_names = ("index_name", "elasticsearch_url", "k1", "b")
    return {field: getattr(self, field) for field in field_names}
|
||||
|
@@ -2,7 +2,8 @@ import json
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...base import Document
|
||||
from kotaemon.base import Document
|
||||
|
||||
from .base import BaseDocumentStore
|
||||
|
||||
|
||||
@@ -74,3 +75,6 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
with open(path) as f:
|
||||
store = json.load(f)
|
||||
self._store = {key: Document.from_dict(value) for key, value in store.items()}
|
||||
|
||||
def __persist_flow__(self):
    """Return an empty dict: this store exposes no settings to persist."""
    return dict()
|
||||
|
44
knowledgehub/storages/docstores/simple_file.py
Normal file
44
knowledgehub/storages/docstores/simple_file.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from kotaemon.base import Document
|
||||
|
||||
from .in_memory import InMemoryDocumentStore
|
||||
|
||||
|
||||
class SimpleFileDocumentStore(InMemoryDocumentStore):
    """InMemoryDocumentStore variant that saves itself after every change.

    Each call to `add` or `delete` is followed by a `save` to the path given
    at construction time.
    """

    def __init__(self, path: Union[str, Path]):
        """Create the store and load existing content when the file exists.

        Args:
            path: location of the backing file. If a regular file already
                exists there, it is loaded into the store.
        """
        super().__init__()
        self._path = path
        # NOTE(review): `None` is tolerated here, yet add()/delete() still pass
        # `self._path` to save() -- confirm whether a path-less store is meant
        # to be supported.
        if path is not None and Path(path).is_file():
            self.load(path)

    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        **kwargs,
    ):
        """Add document into document store, then save the store to file

        Args:
            docs: list of documents to add
            ids: specify the ids of documents to add or
                use existing doc.doc_id
            exist_ok: raise error when duplicate doc-id
                found in the docstore (default to False)
        """
        super().add(docs=docs, ids=ids, **kwargs)
        # Persist right away so the file always mirrors the in-memory corpus.
        self.save(self._path)

    def delete(self, ids: Union[List[str], str]):
        """Delete document by id, then save the store to file"""
        super().delete(ids=ids)
        self.save(self._path)

    def __persist_flow__(self):
        """Return the serialized `path` this store was created with."""
        # Imported lazily, as in the original, so theflow is only needed here.
        from theflow.utils.modules import serialize

        return {"path": serialize(self._path)}
|
@@ -1,5 +1,11 @@
|
||||
from .base import BaseVectorStore
|
||||
from .chroma import ChromaVectorStore
|
||||
from .in_memory import InMemoryVectorStore
|
||||
from .simple_file import SimpleFileVectorStore
|
||||
|
||||
__all__ = ["BaseVectorStore", "ChromaVectorStore", "InMemoryVectorStore"]
|
||||
__all__ = [
|
||||
"BaseVectorStore",
|
||||
"ChromaVectorStore",
|
||||
"InMemoryVectorStore",
|
||||
"SimpleFileVectorStore",
|
||||
]
|
||||
|
@@ -1,12 +1,14 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, List, Optional, Tuple, Type, Union
|
||||
from typing import Any, Optional
|
||||
|
||||
from llama_index.schema import NodeRelationship, RelatedNodeInfo
|
||||
from llama_index.vector_stores.types import BasePydanticVectorStore
|
||||
from llama_index.vector_stores.types import VectorStore as LIVectorStore
|
||||
from llama_index.vector_stores.types import VectorStoreQuery
|
||||
|
||||
from kotaemon.base import Document, DocumentWithEmbedding
|
||||
from kotaemon.base import DocumentWithEmbedding
|
||||
|
||||
|
||||
class BaseVectorStore(ABC):
|
||||
@@ -17,10 +19,10 @@ class BaseVectorStore(ABC):
|
||||
@abstractmethod
|
||||
def add(
|
||||
self,
|
||||
embeddings: List[List[float]] | List[DocumentWithEmbedding],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
) -> List[str]:
|
||||
embeddings: list[list[float]] | list[DocumentWithEmbedding],
|
||||
metadatas: Optional[list[dict]] = None,
|
||||
ids: Optional[list[str]] = None,
|
||||
) -> list[str]:
|
||||
"""Add vector embeddings to vector stores
|
||||
|
||||
Args:
|
||||
@@ -35,16 +37,7 @@ class BaseVectorStore(ABC):
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def add_from_docs(self, docs: List[Document]):
|
||||
"""Add vector embeddings to vector stores
|
||||
|
||||
Args:
|
||||
docs: List of Document objects
|
||||
"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def delete(self, ids: List[str], **kwargs):
|
||||
def delete(self, ids: list[str], **kwargs):
|
||||
"""Delete vector embeddings from vector stores
|
||||
|
||||
Args:
|
||||
@@ -56,11 +49,11 @@ class BaseVectorStore(ABC):
|
||||
@abstractmethod
|
||||
def query(
|
||||
self,
|
||||
embedding: List[float],
|
||||
embedding: list[float],
|
||||
top_k: int = 1,
|
||||
ids: Optional[List[str]] = None,
|
||||
ids: Optional[list[str]] = None,
|
||||
**kwargs,
|
||||
) -> Tuple[List[List[float]], List[float], List[str]]:
|
||||
) -> tuple[list[list[float]], list[float], list[str]]:
|
||||
"""Return the top k most similar vector embeddings
|
||||
|
||||
Args:
|
||||
@@ -73,17 +66,9 @@ class BaseVectorStore(ABC):
|
||||
"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def load(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def save(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
|
||||
class LlamaIndexVectorStore(BaseVectorStore):
|
||||
_li_class: Type[Union[LIVectorStore, BasePydanticVectorStore]]
|
||||
_li_class: type[LIVectorStore | BasePydanticVectorStore]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
if self._li_class is None:
|
||||
@@ -104,12 +89,12 @@ class LlamaIndexVectorStore(BaseVectorStore):
|
||||
|
||||
def add(
|
||||
self,
|
||||
embeddings: List[List[float]] | List[DocumentWithEmbedding],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
embeddings: list[list[float]] | list[DocumentWithEmbedding],
|
||||
metadatas: Optional[list[dict]] = None,
|
||||
ids: Optional[list[str]] = None,
|
||||
):
|
||||
if isinstance(embeddings[0], list):
|
||||
nodes = [
|
||||
nodes: list[DocumentWithEmbedding] = [
|
||||
DocumentWithEmbedding(embedding=embedding) for embedding in embeddings
|
||||
]
|
||||
else:
|
||||
@@ -126,20 +111,17 @@ class LlamaIndexVectorStore(BaseVectorStore):
|
||||
|
||||
return self._client.add(nodes=nodes)
|
||||
|
||||
def add_from_docs(self, docs: List[Document]):
|
||||
return self._client.add(nodes=docs)
|
||||
|
||||
def delete(self, ids: List[str], **kwargs):
|
||||
def delete(self, ids: list[str], **kwargs):
    """Delete the entries for the given ids from the underlying client.

    Args:
        ids: ids to remove; each one is passed to the client as ``ref_doc_id``
        **kwargs: forwarded unchanged to every client ``delete`` call
    """
    for doc_id in ids:
        self._client.delete(ref_doc_id=doc_id, **kwargs)
|
||||
|
||||
def query(
|
||||
self,
|
||||
embedding: List[float],
|
||||
embedding: list[float],
|
||||
top_k: int = 1,
|
||||
ids: Optional[List[str]] = None,
|
||||
ids: Optional[list[str]] = None,
|
||||
**kwargs,
|
||||
) -> Tuple[List[List[float]], List[float], List[str]]:
|
||||
) -> tuple[list[list[float]], list[float], list[str]]:
|
||||
output = self._client.query(
|
||||
query=VectorStoreQuery(
|
||||
query_embedding=embedding,
|
||||
|
@@ -21,6 +21,17 @@ class ChromaVectorStore(LlamaIndexVectorStore):
|
||||
flat_metadata: bool = True,
|
||||
**kwargs: Any,
|
||||
):
|
||||
self._path = path
|
||||
self._collection_name = collection_name
|
||||
self._host = host
|
||||
self._port = port
|
||||
self._ssl = ssl
|
||||
self._headers = headers
|
||||
self._collection_kwargs = collection_kwargs
|
||||
self._stores_text = stores_text
|
||||
self._flat_metadata = flat_metadata
|
||||
self._kwargs = kwargs
|
||||
|
||||
try:
|
||||
import chromadb
|
||||
except ImportError:
|
||||
@@ -70,8 +81,16 @@ class ChromaVectorStore(LlamaIndexVectorStore):
|
||||
def count(self) -> int:
    """Return whatever the underlying collection's ``count()`` reports."""
    total = self._collection.count()
    return total
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def load(self, *args, **kwargs):
|
||||
pass
|
||||
def __persist_flow__(self):
    """Return the stored constructor arguments as a dict.

    Each named field is read from the private attribute of the same name
    (``path`` from ``self._path`` and so on). The extra keyword arguments
    kept in ``self._kwargs`` are merged in last.
    """
    field_names = (
        "path",
        "collection_name",
        "host",
        "port",
        "ssl",
        "headers",
        "collection_kwargs",
        "stores_text",
        "flat_metadata",
    )
    state = {field: getattr(self, f"_{field}") for field in field_names}
    state.update(self._kwargs)
    return state
|
||||
|
@@ -1,5 +1,4 @@
|
||||
"""Simple vector store index."""
|
||||
|
||||
from typing import Any, Optional, Type
|
||||
|
||||
import fsspec
|
||||
@@ -53,3 +52,11 @@ class InMemoryVectorStore(LlamaIndexVectorStore):
|
||||
fs: An abstract super-class for pythonic file-systems
|
||||
"""
|
||||
self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)
|
||||
|
||||
def __persist_flow__(self):
    """Return ``self._data`` as a dict tagged with its dotted class path.

    The result has a single ``"data"`` key holding ``self._data.to_dict()``
    plus a ``"__type__"`` entry of the form ``<module>.<qualname>``.
    """
    data = self._data
    payload = data.to_dict()
    payload["__type__"] = "{}.{}".format(
        data.__module__, data.__class__.__qualname__
    )
    # NOTE: the filesystem handle (self._fs) is not part of the returned state.
    return {"data": payload}
|
||||
|
66
knowledgehub/storages/vectorstores/simple_file.py
Normal file
66
knowledgehub/storages/vectorstores/simple_file.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Simple file vector store index."""
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Type
|
||||
|
||||
import fsspec
|
||||
from llama_index.vector_stores import SimpleVectorStore as LISimpleVectorStore
|
||||
from llama_index.vector_stores.simple import SimpleVectorStoreData
|
||||
|
||||
from kotaemon.base import DocumentWithEmbedding
|
||||
|
||||
from .base import LlamaIndexVectorStore
|
||||
|
||||
|
||||
class SimpleFileVectorStore(LlamaIndexVectorStore):
    """Similar to InMemoryVectorStore but is backed by file by default.

    The wrapped client is persisted to `path` after every `add` and `delete`.
    """

    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore
    store_text: bool = False

    def __init__(
        self,
        # String annotation: this module has no `from __future__ import
        # annotations`, so a bare `str | Path` would be evaluated at definition
        # time and fail on interpreters without PEP 604 support.
        path: "str | Path",
        data: Optional[SimpleVectorStoreData] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize params.

        Args:
            path: file the store is persisted to; if a file already exists
                there, the client is re-created from it
            data: initial vector store data; a fresh `SimpleVectorStoreData`
                is kept on the instance when omitted
            fs: filesystem used for persistence; defaults to the local
                `fsspec` "file" filesystem
            **kwargs: forwarded to `LlamaIndexVectorStore.__init__`
        """
        self._data = data or SimpleVectorStoreData()
        self._fs = fs or fsspec.filesystem("file")
        self._path = path
        self._save_path = Path(path)

        # `data` and `fs` are forwarded exactly as given (possibly None).
        super().__init__(
            data=data,
            fs=fs,
            **kwargs,
        )

        # An existing file takes over: replace the freshly built client with
        # one restored from the persisted state.
        if self._save_path.is_file():
            self._client = self._li_class.from_persist_path(
                persist_path=str(self._save_path), fs=self._fs
            )

    def _persist(self) -> None:
        """Write the current client state to the backing file."""
        self._client.persist(str(self._save_path), self._fs)

    def add(
        self,
        embeddings: "list[list[float]] | list[DocumentWithEmbedding]",
        metadatas: "Optional[list[dict]]" = None,
        ids: "Optional[list[str]]" = None,
    ):
        """Add embeddings through the parent class, then persist to file.

        Returns:
            whatever `LlamaIndexVectorStore.add` returns
        """
        result = super().add(embeddings, metadatas, ids)
        self._persist()
        return result

    def delete(self, ids: "list[str]", **kwargs):
        """Delete the given ids through the parent class, then persist to file.

        Returns:
            whatever `LlamaIndexVectorStore.delete` returns
        """
        result = super().delete(ids, **kwargs)
        self._persist()
        return result

    def __persist_flow__(self):
        """Return the store's data dict (tagged with its class path) and path."""
        # NOTE(review): `self._data` is only assigned in `__init__`; vectors
        # added later or loaded from the file appear to live in `self._client`
        # -- confirm that `data` here is what should be persisted.
        d = self._data.to_dict()
        d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}"
        return {
            "data": d,
            "path": str(self._path),
            # NOTE: the filesystem handle (self._fs) is not included.
        }
|
Reference in New Issue
Block a user