Migrate the MVP into kotaemon (#108)

- Migrate the MVP into kotaemon.
- Preliminarily include the pipeline within the chatbot interface.
- Organize MVP as an application.

Todo:

- Add an info panel to view the planning of agents -> Fix streaming agents' output.

Resolve: #60
Resolve: #61 
Resolve: #62
This commit is contained in:
Duc Nguyen (john)
2024-01-10 15:28:09 +07:00
committed by GitHub
parent 230328c62f
commit 5a9d6f75be
31 changed files with 273 additions and 92 deletions

View File

@@ -16,6 +16,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
elasticsearch_url: str = "http://localhost:9200",
k1: float = 2.0,
b: float = 0.75,
**kwargs,
):
try:
from elasticsearch import Elasticsearch
@@ -31,7 +32,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
self.b = b
# Create an Elasticsearch client instance
self.client = Elasticsearch(elasticsearch_url)
self.client = Elasticsearch(elasticsearch_url, **kwargs)
self.es_bulk = bulk
# Define the index settings and mappings
settings = {
@@ -63,19 +64,16 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
self,
docs: Union[Document, List[Document]],
ids: Optional[Union[List[str], str]] = None,
**kwargs
refresh_indices: bool = True,
**kwargs,
):
"""Add document into document store
Args:
docs: list of documents to add
ids: specify the ids of documents to add or
use existing doc.doc_id
refresh_indices: request Elasticsearch to update
its index (default to True)
ids: specify the ids of documents to add or use existing doc.doc_id
refresh_indices: request Elasticsearch to update its index (default to True)
"""
refresh_indices = kwargs.pop("refresh_indices", True)
if ids and not isinstance(ids, list):
ids = [ids]
if not isinstance(docs, list):
@@ -120,7 +118,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
)
return docs
def query(self, query: str, top_k: int = 10) -> List[Document]:
def query(
self, query: str, top_k: int = 10, doc_ids: Optional[list] = None
) -> List[Document]:
"""Search Elasticsearch docstore using search query (BM25)
Args:
@@ -131,7 +131,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
Returns:
List[Document]: List of result documents
"""
query_dict = {"query": {"match": {"content": query}}, "size": top_k}
query_dict: dict = {"query": {"match": {"content": query}}, "size": top_k}
if doc_ids:
query_dict["query"]["match"]["_id"] = {"values": doc_ids}
return self.query_raw(query_dict)
def get(self, ids: Union[List[str], str]) -> List[Document]:

View File

@@ -74,6 +74,11 @@ class InMemoryDocumentStore(BaseDocumentStore):
"""Load document store from path"""
with open(path) as f:
store = json.load(f)
# TODO: save and load aren't lossless. A Document-subclass will lose
# information. Need to edit the `to_dict` and `from_dict` methods in
# the Document class.
# For better query support, utilize SQLite as the default document store.
# Also, for portability, use SQLAlchemy for document store.
self._store = {key: Document.from_dict(value) for key, value in store.items()}
def __persist_flow__(self):

View File

@@ -15,6 +15,18 @@ class SimpleFileDocumentStore(InMemoryDocumentStore):
if path is not None and Path(path).is_file():
self.load(path)
def get(self, ids: Union[List[str], str]) -> List[Document]:
    """Return the stored documents for the given id or list of ids.

    A single id is wrapped into a one-element list. If any requested id is
    absent from the in-memory store, the store is reloaded once from
    ``self._path`` before the lookup is performed.

    Args:
        ids: a document id, or a list of document ids

    Returns:
        The documents in the same order as the requested ids.

    Raises:
        KeyError: if an id is still not in the store after the reload.
    """
    wanted = ids if isinstance(ids, list) else [ids]

    # Reload at most once, no matter how many ids are missing.
    if any(doc_id not in self._store for doc_id in wanted):
        self.load(self._path)

    return [self._store[doc_id] for doc_id in wanted]
def add(
self,
docs: Union[Document, List[Document]],