[AUR-420] Provide document store base interface and an in-memory version (#21)

Document store handles storing and indexing Documents. It supports the following interfaces:

- add: add 1 or more documents into document store
- get: get a list of documents
- get_all: get all documents in a document store
- delete: delete 1 or more documents
- save: persist a document store to disk
- load: load a document store from disk
This commit is contained in:
Nguyen Trung Duc (john) 2023-09-19 14:49:23 +07:00 committed by GitHub
parent 620b2b03ca
commit 2a3a23ecd7
5 changed files with 185 additions and 1 deletions

View File

@ -0,0 +1,4 @@
# Re-export the abstract interface and the in-memory implementation so both
# can be imported directly from this package.
from .base import BaseDocumentStore
from .simple import InMemoryDocumentStore
# Names that make up the public API of this package.
__all__ = ["BaseDocumentStore", "InMemoryDocumentStore"]

View File

@ -0,0 +1,54 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Optional, Union
from ..documents.base import Document
class BaseDocumentStore(ABC):
    """Abstract interface for a store in charge of storing and managing documents.

    Every method is abstract; concrete stores must implement all of them.
    """

    @abstractmethod
    def __init__(self, *args, **kwargs):
        """Set up the store; concrete stores define their own arguments."""

    @abstractmethod
    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        exist_ok: bool = False,
    ):
        """Add one or more documents to the document store.

        Args:
            docs: a single document or a list of documents
            ids: id or list of ids for the documents; optional, when not set
                each document's ``doc_id`` is used
            exist_ok: if True, no error is raised when a document already exists
        """

    @abstractmethod
    def get(self, ids: Union[List[str], str]) -> List[Document]:
        """Return the documents matching the given id or list of ids."""

    @abstractmethod
    def get_all(self) -> dict:
        """Return all documents held by the store."""

    @abstractmethod
    def delete(self, ids: Union[List[str], str]):
        """Remove the documents matching the given id or list of ids."""

    @abstractmethod
    def save(self, path: Union[str, Path]):
        """Persist the document store to ``path``."""

    @abstractmethod
    def load(self, path: Union[str, Path]):
        """Load the document store from ``path``."""

View File

@ -0,0 +1,68 @@
import json
from pathlib import Path
from typing import List, Optional, Union
from ..documents.base import Document
from .base import BaseDocumentStore
class InMemoryDocumentStore(BaseDocumentStore):
    """Simple in-memory document store that keeps documents in a dictionary.

    Documents live in ``self.store``, keyed by document id.
    """

    def __init__(self):
        self.store = {}

    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        exist_ok: bool = False,
    ):
        """Add one or more documents to the store.

        Args:
            docs: a single document or a list of documents
            ids: id or list of ids, matched to ``docs`` by position; when not
                set (or empty), each document's ``doc_id`` is used
            exist_ok: if True, a document whose id is already stored is
                overwritten instead of raising an error

        Raises:
            ValueError: if the number of ids differs from the number of
                documents, or if ``exist_ok`` is False and an id is already
                in the store or repeated within this call
        """
        # Normalize docs first: the fallback below iterates over it, which
        # would break for a single (non-list) document.
        if not isinstance(docs, list):
            docs = [docs]

        if ids:
            doc_ids = ids if isinstance(ids, list) else [ids]
        else:
            doc_ids = [doc.doc_id for doc in docs]

        # zip() would silently drop the surplus documents or ids.
        if len(doc_ids) != len(docs):
            raise ValueError(
                f"Got {len(doc_ids)} ids for {len(docs)} documents; "
                "the numbers must match"
            )

        # Validate the whole batch before mutating, so that a failed call
        # leaves the store untouched.
        if not exist_ok:
            seen = set()
            for doc_id in doc_ids:
                if doc_id in self.store or doc_id in seen:
                    raise ValueError(f"Document with id {doc_id} already exist")
                seen.add(doc_id)

        self.store.update(zip(doc_ids, docs))

    def get(self, ids: Union[List[str], str]) -> List[Document]:
        """Return the documents for an id or list of ids, in the given order.

        Raises:
            KeyError: if an id is not in the store
        """
        if not isinstance(ids, list):
            ids = [ids]
        return [self.store[doc_id] for doc_id in ids]

    def get_all(self) -> dict:
        """Return the underlying id -> document dictionary (not a copy)."""
        return self.store

    def delete(self, ids: Union[List[str], str]):
        """Remove the documents for an id or list of ids.

        Raises:
            KeyError: if an id is not in the store
        """
        if not isinstance(ids, list):
            ids = [ids]
        for doc_id in ids:
            del self.store[doc_id]

    def save(self, path: Union[str, Path]):
        """Write the store to ``path`` as JSON, one ``to_dict()`` per document."""
        store = {key: value.to_dict() for key, value in self.store.items()}
        # Explicit encoding keeps the file independent of the platform default.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(store, f)

    def load(self, path: Union[str, Path]):
        """Replace the store's content with the documents read from ``path``.

        The file is expected to be JSON in the format written by ``save``.
        """
        with open(path, encoding="utf-8") as f:
            store = json.load(f)
        self.store = {key: Document.from_dict(value) for key, value in store.items()}

View File

@ -1,7 +1,7 @@
from abc import abstractmethod from abc import abstractmethod
from typing import List, Type from typing import List, Type
from langchain.embeddings.base import Embeddings as LCEmbeddings from langchain.schema.embeddings import Embeddings as LCEmbeddings
from theflow import Param from theflow import Param
from ..components import BaseComponent from ..components import BaseComponent

58
tests/test_docstores.py Normal file
View File

@ -0,0 +1,58 @@
import pytest
from kotaemon.docstores import InMemoryDocumentStore
from kotaemon.documents.base import Document
def test_simple_document_store_base_interfaces(tmp_path):
    """Test all interfaces of a document store.

    Covers add (with and without explicit ids, with and without exist_ok),
    get, get_all, delete, and a save/load round trip through ``tmp_path``.
    """
    store = InMemoryDocumentStore()
    docs = [
        Document(text=f"Sample text {idx}", meta={"meta_key": f"meta_value_{idx}"})
        for idx in range(10)
    ]

    # Test add and get all
    assert len(store.get_all()) == 0, "Document store should be empty"
    store.add(docs)
    assert len(store.get_all()) == 10, "Document store should have 10 documents"

    # Test add with provided ids
    store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)])
    assert len(store.get_all()) == 20, "Document store should have 20 documents"

    # Test add without exist_ok
    with pytest.raises(ValueError):
        store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)])

    # Update ok with add exist_ok
    store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)], exist_ok=True)
    assert len(store.get_all()) == 20, "Document store should have 20 documents"

    # Test get with str id
    matched = store.get(docs[0].doc_id)
    assert len(matched) == 1, "Should return 1 document"
    assert matched[0].text == docs[0].text, "Should return the correct document"

    # Test get with list of ids
    matched = store.get([docs[0].doc_id, docs[1].doc_id])
    assert len(matched) == 2, "Should return 2 documents"
    assert [doc.text for doc in matched] == [doc.text for doc in docs[:2]]

    # Test delete with str id
    store.delete(docs[0].doc_id)
    assert len(store.get_all()) == 19, "Document store should have 19 documents"
    # The count alone does not prove the right document was removed
    with pytest.raises(KeyError):
        store.get(docs[0].doc_id)

    # Test delete with list of ids
    store.delete([docs[1].doc_id, docs[2].doc_id])
    assert len(store.get_all()) == 17, "Document store should have 17 documents"
    with pytest.raises(KeyError):
        store.get([docs[1].doc_id, docs[2].doc_id])

    # Test save
    store.save(tmp_path / "store.json")
    assert (tmp_path / "store.json").exists(), "File should exist"

    # Test load
    store2 = InMemoryDocumentStore()
    store2.load(tmp_path / "store.json")
    assert (
        len(store2.get_all()) == 17
    ), "Loaded document store should have 17 documents"