Migrate the MVP into kotaemon (#108)

- Migrate the MVP into kotaemon.
- Preliminarily include the pipeline within the chatbot interface.
- Organize MVP as an application.

Todo:

- Add an info panel to view the planning of agents -> Fix streaming agents' output.

Resolve: #60
Resolve: #61 
Resolve: #62
This commit is contained in:
Duc Nguyen (john)
2024-01-10 15:28:09 +07:00
committed by GitHub
parent 230328c62f
commit 5a9d6f75be
31 changed files with 273 additions and 92 deletions

View File

@@ -16,6 +16,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
elasticsearch_url: str = "http://localhost:9200",
k1: float = 2.0,
b: float = 0.75,
**kwargs,
):
try:
from elasticsearch import Elasticsearch
@@ -31,7 +32,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
self.b = b
# Create an Elasticsearch client instance
self.client = Elasticsearch(elasticsearch_url)
self.client = Elasticsearch(elasticsearch_url, **kwargs)
self.es_bulk = bulk
# Define the index settings and mappings
settings = {
@@ -63,19 +64,16 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
self,
docs: Union[Document, List[Document]],
ids: Optional[Union[List[str], str]] = None,
**kwargs
refresh_indices: bool = True,
**kwargs,
):
"""Add document into document store
Args:
docs: list of documents to add
ids: specify the ids of documents to add or
use existing doc.doc_id
refresh_indices: request Elasticsearch to update
its index (default to True)
ids: specify the ids of documents to add or use existing doc.doc_id
refresh_indices: request Elasticsearch to update its index (default to True)
"""
refresh_indices = kwargs.pop("refresh_indices", True)
if ids and not isinstance(ids, list):
ids = [ids]
if not isinstance(docs, list):
@@ -120,7 +118,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
)
return docs
def query(self, query: str, top_k: int = 10) -> List[Document]:
def query(
self, query: str, top_k: int = 10, doc_ids: Optional[list] = None
) -> List[Document]:
"""Search Elasticsearch docstore using search query (BM25)
Args:
@@ -131,7 +131,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
Returns:
List[Document]: List of result documents
"""
query_dict = {"query": {"match": {"content": query}}, "size": top_k}
query_dict: dict = {"query": {"match": {"content": query}}, "size": top_k}
if doc_ids:
query_dict["query"]["match"]["_id"] = {"values": doc_ids}
return self.query_raw(query_dict)
def get(self, ids: Union[List[str], str]) -> List[Document]:

View File

@@ -74,6 +74,11 @@ class InMemoryDocumentStore(BaseDocumentStore):
"""Load document store from path"""
with open(path) as f:
store = json.load(f)
# TODO: save and load aren't lossless. A Document-subclass will lose
# information. Need to edit the `to_dict` and `from_dict` methods in
# the Document class.
# For better query support, utilize SQLite as the default document store.
# Also, for portability, use SQLAlchemy for document store.
self._store = {key: Document.from_dict(value) for key, value in store.items()}
def __persist_flow__(self):

View File

@@ -15,6 +15,18 @@ class SimpleFileDocumentStore(InMemoryDocumentStore):
if path is not None and Path(path).is_file():
self.load(path)
def get(self, ids: Union[List[str], str]) -> List[Document]:
    """Fetch documents by id, re-reading the store file on a miss

    Args:
        ids: a single document id or a list of document ids

    Returns:
        the documents stored under `ids`, in the same order as requested

    Raises:
        KeyError: if an id is still unknown after the store has been reloaded
    """
    wanted = ids if isinstance(ids, list) else [ids]

    # A single reload is enough regardless of how many ids are missing.
    if any(key not in self._store for key in wanted):
        self.load(self._path)

    return [self._store[key] for key in wanted]
def add(
self,
docs: Union[Document, List[Document]],

View File

@@ -76,8 +76,15 @@ class LlamaIndexVectorStore(BaseVectorStore):
"Require `_li_class` to set a VectorStore class from LlamarIndex"
)
from dataclasses import fields
self._client = self._li_class(*args, **kwargs)
self._vsq_kwargs = {_.name for _ in fields(VectorStoreQuery)}
for key in ["query_embedding", "similarity_top_k", "node_ids"]:
if key in self._vsq_kwargs:
self._vsq_kwargs.remove(key)
def __setattr__(self, name: str, value: Any) -> None:
if name.startswith("_"):
return super().__setattr__(name, value)
@@ -122,13 +129,35 @@ class LlamaIndexVectorStore(BaseVectorStore):
ids: Optional[list[str]] = None,
**kwargs,
) -> tuple[list[list[float]], list[float], list[str]]:
"""Return the top k most similar vector embeddings
Args:
embedding: List of embeddings
top_k: Number of most similar embeddings to return
ids: List of ids of the embeddings to be queried
kwargs: extra query parameters. Depending on the name, these parameters
will be used when constructing the VectorStoreQuery object or when
performing querying of the underlying vector store.
Returns:
the matched embeddings, the similarity scores, and the ids
"""
vsq_kwargs = {}
vs_kwargs = {}
for kwkey, kwvalue in kwargs.items():
if kwkey in self._vsq_kwargs:
vsq_kwargs[kwkey] = kwvalue
else:
vs_kwargs[kwkey] = kwvalue
output = self._client.query(
query=VectorStoreQuery(
query_embedding=embedding,
similarity_top_k=top_k,
node_ids=ids,
**kwargs,
**vsq_kwargs,
),
**vs_kwargs,
)
embeddings = []

View File

@@ -64,7 +64,7 @@ class ChromaVectorStore(LlamaIndexVectorStore):
ids: List of ids of the embeddings to be deleted
kwargs: meant for vectorstore-specific parameters
"""
self._client._collection.delete(ids=ids)
self._client.client.delete(ids=ids)
def delete_collection(self, collection_name: Optional[str] = None):
"""Delete entire collection under specified name from vector stores