Restructure index to allow it to be dynamically created by end-user (#151)
1. Introduce the concept of "collection_name" to the docstore and the vector store. Each collection can be viewed similarly to a table in a SQL database; it allows the information within a data source to be organized better. 2. Move the `Index` and `Source` tables from the application scope into the index scope. For each new index created by the user, the number of these tables increases accordingly, so they depend on the index rather than on the app. 3. Make each index responsible for its own UI components in the app. 4. Construct the File UI page.
This commit is contained in:
committed by
GitHub
parent
cc87aaa783
commit
8a90fcfc99
@@ -45,3 +45,8 @@ class BaseDocumentStore(ABC):
|
||||
def delete(self, ids: Union[List[str], str]):
|
||||
"""Delete document by id"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def drop(self):
|
||||
"""Drop the document store"""
|
||||
...
|
||||
|
@@ -12,7 +12,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
index_name: str = "docstore",
|
||||
collection_name: str = "docstore",
|
||||
elasticsearch_url: str = "http://localhost:9200",
|
||||
k1: float = 2.0,
|
||||
b: float = 0.75,
|
||||
@@ -27,7 +27,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
)
|
||||
|
||||
self.elasticsearch_url = elasticsearch_url
|
||||
self.index_name = index_name
|
||||
self.index_name = collection_name
|
||||
self.k1 = k1
|
||||
self.b = b
|
||||
|
||||
@@ -55,9 +55,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
}
|
||||
|
||||
# Create the index with the specified settings and mappings
|
||||
if not self.client.indices.exists(index=index_name):
|
||||
if not self.client.indices.exists(index=self.index_name):
|
||||
self.client.indices.create(
|
||||
index=index_name, mappings=mappings, settings=settings
|
||||
index=self.index_name, mappings=mappings, settings=settings
|
||||
)
|
||||
|
||||
def add(
|
||||
@@ -164,6 +164,11 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
self.client.delete_by_query(index=self.index_name, body=query)
|
||||
self.client.indices.refresh(index=self.index_name)
|
||||
|
||||
def drop(self):
    """Drop the document store by deleting its backing Elasticsearch index.

    No refresh is issued afterwards: once the index has been deleted there is
    nothing left to refresh, and asking Elasticsearch to refresh an index
    that no longer exists is rejected with an index-not-found error.
    """
    self.client.indices.delete(index=self.index_name)
|
||||
|
||||
def __persist_flow__(self):
|
||||
return {
|
||||
"index_name": self.index_name,
|
||||
|
@@ -83,3 +83,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
|
||||
def __persist_flow__(self):
|
||||
return {}
|
||||
|
||||
def drop(self) -> None:
    """Discard every stored document.

    The store is rebound to a brand-new empty dict rather than cleared in
    place, so any outside reference to the previous dict keeps its contents.
    """
    self._store = {}
|
||||
|
@@ -9,11 +9,15 @@ from .in_memory import InMemoryDocumentStore
|
||||
class SimpleFileDocumentStore(InMemoryDocumentStore):
|
||||
"""Improve InMemoryDocumentStore by auto saving whenever the corpus is changed"""
|
||||
|
||||
def __init__(self, path: str | Path):
|
||||
def __init__(self, path: str | Path, collection_name: str = "default"):
|
||||
super().__init__()
|
||||
self._path = path
|
||||
if path is not None and Path(path).is_file():
|
||||
self.load(path)
|
||||
self._collection_name = collection_name
|
||||
|
||||
Path(path).mkdir(parents=True, exist_ok=True)
|
||||
self._save_path = Path(path) / f"{collection_name}.json"
|
||||
if self._save_path.is_file():
|
||||
self.load(self._save_path)
|
||||
|
||||
def get(self, ids: Union[List[str], str]) -> List[Document]:
|
||||
"""Get document by id"""
|
||||
@@ -22,7 +26,7 @@ class SimpleFileDocumentStore(InMemoryDocumentStore):
|
||||
|
||||
for doc_id in ids:
|
||||
if doc_id not in self._store:
|
||||
self.load(self._path)
|
||||
self.load(self._save_path)
|
||||
break
|
||||
|
||||
return [self._store[doc_id] for doc_id in ids]
|
||||
@@ -43,14 +47,22 @@ class SimpleFileDocumentStore(InMemoryDocumentStore):
|
||||
found in the docstore (default to False)
|
||||
"""
|
||||
super().add(docs=docs, ids=ids, **kwargs)
|
||||
self.save(self._path)
|
||||
self.save(self._save_path)
|
||||
|
||||
def delete(self, ids: Union[List[str], str]):
|
||||
"""Delete document by id"""
|
||||
super().delete(ids=ids)
|
||||
self.save(self._path)
|
||||
self.save(self._save_path)
|
||||
|
||||
def drop(self):
|
||||
"""Drop the document store"""
|
||||
super().drop()
|
||||
self._save_path.unlink(missing_ok=True)
|
||||
|
||||
def __persist_flow__(self):
|
||||
from theflow.utils.modules import serialize
|
||||
|
||||
return {"path": serialize(self._path)}
|
||||
return {
|
||||
"path": serialize(self._path),
|
||||
"collection_name": self._collection_name,
|
||||
}
|
||||
|
@@ -66,6 +66,11 @@ class BaseVectorStore(ABC):
|
||||
"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def drop(self):
|
||||
"""Drop the vector store"""
|
||||
...
|
||||
|
||||
|
||||
class LlamaIndexVectorStore(BaseVectorStore):
|
||||
_li_class: type[LIVectorStore | BasePydanticVectorStore]
|
||||
|
@@ -66,17 +66,9 @@ class ChromaVectorStore(LlamaIndexVectorStore):
|
||||
"""
|
||||
self._client.client.delete(ids=ids)
|
||||
|
||||
def delete_collection(self, collection_name: Optional[str] = None):
|
||||
"""Delete entire collection under specified name from vector stores
|
||||
|
||||
Args:
|
||||
collection_name: Name of the collection to delete
|
||||
"""
|
||||
# a rather ugly chain call but it do the job of finding
|
||||
# original chromadb client and call delete_collection() method
|
||||
if collection_name is None:
|
||||
collection_name = self._client.client.name
|
||||
self._client.client._client.delete_collection(collection_name)
|
||||
def drop(self):
|
||||
"""Delete entire collection from vector stores"""
|
||||
self._client.client._client.delete_collection(self._client.client.name)
|
||||
|
||||
def count(self) -> int:
|
||||
return self._collection.count()
|
||||
|
@@ -53,6 +53,10 @@ class InMemoryVectorStore(LlamaIndexVectorStore):
|
||||
"""
|
||||
self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)
|
||||
|
||||
def drop(self):
|
||||
"""Clear the old data"""
|
||||
self._data = SimpleVectorStoreData()
|
||||
|
||||
def __persist_flow__(self):
|
||||
d = self._data.to_dict()
|
||||
d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}"
|
||||
|
@@ -20,6 +20,7 @@ class SimpleFileVectorStore(LlamaIndexVectorStore):
|
||||
def __init__(
|
||||
self,
|
||||
path: str | Path,
|
||||
collection_name: str = "default",
|
||||
data: Optional[SimpleVectorStoreData] = None,
|
||||
fs: Optional[fsspec.AbstractFileSystem] = None,
|
||||
**kwargs: Any,
|
||||
@@ -27,8 +28,9 @@ class SimpleFileVectorStore(LlamaIndexVectorStore):
|
||||
"""Initialize params."""
|
||||
self._data = data or SimpleVectorStoreData()
|
||||
self._fs = fs or fsspec.filesystem("file")
|
||||
self._collection_name = collection_name
|
||||
self._path = path
|
||||
self._save_path = Path(path)
|
||||
self._save_path = Path(path) / collection_name
|
||||
|
||||
super().__init__(
|
||||
data=data,
|
||||
@@ -56,11 +58,16 @@ class SimpleFileVectorStore(LlamaIndexVectorStore):
|
||||
self._client.persist(str(self._save_path), self._fs)
|
||||
return r
|
||||
|
||||
def drop(self):
|
||||
self._data = SimpleVectorStoreData()
|
||||
self._save_path.unlink(missing_ok=True)
|
||||
|
||||
def __persist_flow__(self):
|
||||
d = self._data.to_dict()
|
||||
d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}"
|
||||
return {
|
||||
"data": d,
|
||||
"collection_name": self._collection_name,
|
||||
"path": str(self._path),
|
||||
# "fs": self._fs,
|
||||
}
|
||||
|
@@ -271,9 +271,7 @@ def test_inmemory_document_store_base_interfaces(tmp_path):
|
||||
def test_simplefile_document_store_base_interfaces(tmp_path):
|
||||
"""Test all interfaces of a a document store"""
|
||||
|
||||
path = tmp_path / "store.json"
|
||||
|
||||
store = SimpleFileDocumentStore(path=path)
|
||||
store = SimpleFileDocumentStore(path=tmp_path)
|
||||
docs = [
|
||||
Document(text=f"Sample text {idx}", meta={"meta_key": f"meta_value_{idx}"})
|
||||
for idx in range(10)
|
||||
@@ -315,13 +313,13 @@ def test_simplefile_document_store_base_interfaces(tmp_path):
|
||||
assert len(store.get_all()) == 17, "Document store should have 17 documents"
|
||||
|
||||
# Test save
|
||||
assert path.exists(), "File should exist"
|
||||
assert (tmp_path / "default.json").exists(), "File should exist"
|
||||
|
||||
# Test load
|
||||
store2 = SimpleFileDocumentStore(path=path)
|
||||
store2 = SimpleFileDocumentStore(path=tmp_path)
|
||||
assert len(store2.get_all()) == 17, "Laded document store should have 17 documents"
|
||||
|
||||
os.remove(path)
|
||||
os.remove(tmp_path / "default.json")
|
||||
|
||||
|
||||
@patch(
|
||||
@@ -329,7 +327,7 @@ def test_simplefile_document_store_base_interfaces(tmp_path):
|
||||
side_effect=_elastic_search_responses,
|
||||
)
|
||||
def test_elastic_document_store(elastic_api):
|
||||
store = ElasticsearchDocumentStore(index_name="test")
|
||||
store = ElasticsearchDocumentStore(collection_name="test")
|
||||
|
||||
docs = [
|
||||
Document(text=f"Sample text {idx}", meta={"meta_key": f"meta_value_{idx}"})
|
||||
|
@@ -81,7 +81,7 @@ class TestChromaVectorStore:
|
||||
), "load function does not load data completely"
|
||||
|
||||
# test delete collection function
|
||||
db2.delete_collection()
|
||||
db2.drop()
|
||||
# reinit the chroma with the same collection name
|
||||
db2 = ChromaVectorStore(path=str(tmp_path))
|
||||
assert (
|
||||
@@ -133,10 +133,11 @@ class TestSimpleFileVectorStore:
|
||||
embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]
|
||||
metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}]
|
||||
ids = ["1", "2", "3"]
|
||||
db = SimpleFileVectorStore(path=tmp_path / "test_save_load_delete.json")
|
||||
collection_name = "test_save_load_delete"
|
||||
db = SimpleFileVectorStore(path=tmp_path, collection_name=collection_name)
|
||||
db.add(embeddings=embeddings, metadatas=metadatas, ids=ids)
|
||||
db.delete(["3"])
|
||||
with open(tmp_path / "test_save_load_delete.json") as f:
|
||||
with open(tmp_path / collection_name) as f:
|
||||
data = json.load(f)
|
||||
assert (
|
||||
"1" and "2" in data["text_id_to_ref_doc_id"]
|
||||
@@ -144,11 +145,11 @@ class TestSimpleFileVectorStore:
|
||||
assert (
|
||||
"3" not in data["text_id_to_ref_doc_id"]
|
||||
), "delete function does not delete data completely"
|
||||
db2 = SimpleFileVectorStore(path=tmp_path / "test_save_load_delete.json")
|
||||
db2 = SimpleFileVectorStore(path=tmp_path, collection_name=collection_name)
|
||||
assert db2.get("2") == [
|
||||
0.4,
|
||||
0.5,
|
||||
0.6,
|
||||
], "load function does not load data completely"
|
||||
|
||||
os.remove(tmp_path / "test_save_load_delete.json")
|
||||
os.remove(tmp_path / collection_name)
|
||||
|
Reference in New Issue
Block a user