Add file-based document store and vector store (#96)
* Modify docstore and vectorstore objects to be reconstructable
* Simplify the file docstore
* Use the simple file docstore and vector store in MVP
This commit is contained in:
committed by
GitHub
parent
0ce3a8832f
commit
37c744b616
@@ -1,5 +1,11 @@
|
||||
from .base import BaseDocumentStore
|
||||
from .elasticsearch import ElasticsearchDocumentStore
|
||||
from .in_memory import InMemoryDocumentStore
|
||||
from .simple_file import SimpleFileDocumentStore
|
||||
|
||||
__all__ = ["BaseDocumentStore", "InMemoryDocumentStore", "ElasticsearchDocumentStore"]
|
||||
__all__ = [
|
||||
"BaseDocumentStore",
|
||||
"InMemoryDocumentStore",
|
||||
"ElasticsearchDocumentStore",
|
||||
"SimpleFileDocumentStore",
|
||||
]
|
||||
|
@@ -1,8 +1,7 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...base import Document
|
||||
from kotaemon.base import Document
|
||||
|
||||
|
||||
class BaseDocumentStore(ABC):
|
||||
@@ -46,13 +45,3 @@ class BaseDocumentStore(ABC):
|
||||
def delete(self, ids: Union[List[str], str]):
|
||||
"""Delete document by id"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def save(self, path: Union[str, Path]):
|
||||
"""Save document to path"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def load(self, path: Union[str, Path]):
|
||||
"""Load document store from path"""
|
||||
...
|
||||
|
@@ -1,7 +1,7 @@
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...base import Document
|
||||
from kotaemon.base import Document
|
||||
|
||||
from .base import BaseDocumentStore
|
||||
|
||||
MAX_DOCS_TO_GET = 10**4
|
||||
@@ -27,6 +27,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
|
||||
self.elasticsearch_url = elasticsearch_url
|
||||
self.index_name = index_name
|
||||
self.k1 = k1
|
||||
self.b = b
|
||||
|
||||
# Create an Elasticsearch client instance
|
||||
self.client = Elasticsearch(elasticsearch_url)
|
||||
@@ -160,10 +162,10 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
self.client.delete_by_query(index=self.index_name, body=query)
|
||||
self.client.indices.refresh(index=self.index_name)
|
||||
|
||||
def save(self, path: Union[str, Path]):
|
||||
"""Save document to path"""
|
||||
# not required for ElasticDocstore
|
||||
|
||||
def load(self, path: Union[str, Path]):
|
||||
"""Load document store from path"""
|
||||
# not required for ElasticDocstore
|
||||
def __persist_flow__(self):
|
||||
return {
|
||||
"index_name": self.index_name,
|
||||
"elasticsearch_url": self.elasticsearch_url,
|
||||
"k1": self.k1,
|
||||
"b": self.b,
|
||||
}
|
||||
|
@@ -2,7 +2,8 @@ import json
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...base import Document
|
||||
from kotaemon.base import Document
|
||||
|
||||
from .base import BaseDocumentStore
|
||||
|
||||
|
||||
@@ -74,3 +75,6 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
with open(path) as f:
|
||||
store = json.load(f)
|
||||
self._store = {key: Document.from_dict(value) for key, value in store.items()}
|
||||
|
||||
def __persist_flow__(self):
|
||||
return {}
|
||||
|
44
knowledgehub/storages/docstores/simple_file.py
Normal file
44
knowledgehub/storages/docstores/simple_file.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from kotaemon.base import Document
|
||||
|
||||
from .in_memory import InMemoryDocumentStore
|
||||
|
||||
|
||||
class SimpleFileDocumentStore(InMemoryDocumentStore):
    """In-memory document store that saves itself after every change.

    Extends InMemoryDocumentStore: each call to `add` or `delete` is followed
    by `self.save(self._path)`, and an existing file at `path` is loaded when
    the store is constructed.
    """

    def __init__(self, path: Union[str, Path]):
        """Create the store and load existing content from `path`, if any.

        Args:
            path: location of the backing file. When it already points to a
                regular file, its content is loaded into the store.
        """
        # `Union[...]` rather than `str | Path`: the annotation is evaluated
        # when the function is defined, and the `|` form raises TypeError on
        # Python < 3.10. It also matches the other signatures in this class.
        super().__init__()
        self._path = path
        # Defensive None check kept from the original so that a missing path
        # does not make `Path(path)` raise during construction.
        if path is not None and Path(path).is_file():
            self.load(path)

    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        **kwargs,
    ):
        """Add document into document store, then save the store to disk.

        Args:
            docs: list of documents to add
            ids: specify the ids of documents to add or
                use existing doc.doc_id
            exist_ok: raise error when duplicate doc-id
                found in the docstore (default to False)
        """
        # Extra keyword arguments (e.g. `exist_ok`) are forwarded unchanged
        # to the parent implementation.
        super().add(docs=docs, ids=ids, **kwargs)
        self.save(self._path)

    def delete(self, ids: Union[List[str], str]):
        """Delete document by id, then save the store to disk."""
        super().delete(ids=ids)
        self.save(self._path)

    def __persist_flow__(self):
        """Return a dict holding the serialized backing-file path.

        NOTE(review): presumably consumed by theflow to reconstruct this
        store with the same `path` argument -- confirm against theflow.
        """
        # Imported lazily, as in the original, so theflow is only required
        # when this method is actually called.
        from theflow.utils.modules import serialize

        return {"path": serialize(self._path)}
|
Reference in New Issue
Block a user