Migrate the MVP into kotaemon (#108)

- Migrate the MVP into kotaemon. - Preliminarily include the pipeline within the chatbot interface. - Organize the MVP as an application. Todo: - Add an info panel to view the planning of agents -> Fix streaming agents' output. Resolve: #60 Resolve: #61 Resolve: #62
2024-01-10 15:28:09 +07:00
parent 230328c62f
commit 5a9d6f75be
31 changed files with 273 additions and 92 deletions
--- a/knowledgehub/storages/docstores/elasticsearch.py
+++ b/knowledgehub/storages/docstores/elasticsearch.py
@@ -16,6 +16,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
        elasticsearch_url: str = "http://localhost:9200",
        k1: float = 2.0,
        b: float = 0.75,
+        **kwargs,
    ):
        try:
            from elasticsearch import Elasticsearch
@@ -31,7 +32,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
        self.b = b

        # Create an Elasticsearch client instance
-        self.client = Elasticsearch(elasticsearch_url)
+        self.client = Elasticsearch(elasticsearch_url, **kwargs)
        self.es_bulk = bulk
        # Define the index settings and mappings
        settings = {
@@ -63,19 +64,16 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
-        **kwargs
+        refresh_indices: bool = True,
+        **kwargs,
    ):
        """Add document into document store

        Args:
            docs: list of documents to add
-            ids: specify the ids of documents to add or
-                use existing doc.doc_id
-            refresh_indices: request Elasticsearch to update
-                its index (default to True)
+            ids: specify the ids of documents to add or use existing doc.doc_id
+            refresh_indices: request Elasticsearch to update its index (default to True)
        """
-        refresh_indices = kwargs.pop("refresh_indices", True)
-
        if ids and not isinstance(ids, list):
            ids = [ids]
        if not isinstance(docs, list):
@@ -120,7 +118,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
            )
        return docs

-    def query(self, query: str, top_k: int = 10) -> List[Document]:
+    def query(
+        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None
+    ) -> List[Document]:
        """Search Elasticsearch docstore using search query (BM25)

        Args:
@@ -131,7 +131,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
        Returns:
            List[Document]: List of result documents
        """
-        query_dict = {"query": {"match": {"content": query}}, "size": top_k}
+        query_dict: dict = {"query": {"match": {"content": query}}, "size": top_k}
+        if doc_ids:
+            query_dict["query"]["match"]["_id"] = {"values": doc_ids}
        return self.query_raw(query_dict)

    def get(self, ids: Union[List[str], str]) -> List[Document]:
--- a/knowledgehub/storages/docstores/in_memory.py
+++ b/knowledgehub/storages/docstores/in_memory.py
@@ -74,6 +74,11 @@ class InMemoryDocumentStore(BaseDocumentStore):
        """Load document store from path"""
        with open(path) as f:
            store = json.load(f)
+        # TODO: save and load aren't lossless. A Document-subclass will lose
+        # information. Need to edit the `to_dict` and `from_dict` methods in
+        # the Document class.
+        # For better query support, utilize SQLite as the default document store.
+        # Also, for portability, use SQLAlchemy for document store.
        self._store = {key: Document.from_dict(value) for key, value in store.items()}

    def __persist_flow__(self):
--- a/knowledgehub/storages/docstores/simple_file.py
+++ b/knowledgehub/storages/docstores/simple_file.py
@@ -15,6 +15,18 @@ class SimpleFileDocumentStore(InMemoryDocumentStore):
        if path is not None and Path(path).is_file():
            self.load(path)

+    def get(self, ids: Union[List[str], str]) -> List[Document]:
+        """Get documents by id, reloading from self._path once if an id is missing"""
+        if not isinstance(ids, list):
+            ids = [ids]  # accept a single id as well as a list of ids
+
+        for doc_id in ids:
+            if doc_id not in self._store:
+                self.load(self._path)  # presumably the file changed since last load
+                break  # a single reload covers every requested id
+
+        return [self._store[doc_id] for doc_id in ids]  # KeyError if still missing
+
    def add(
        self,
        docs: Union[Document, List[Document]],