Migrate the MVP into kotaemon (#108)

- Migrate the MVP into kotaemon. - Preliminarily include the pipeline within the chatbot interface. - Organize the MVP as an application. Todo: - Add an info panel to view the planning of agents -> Fix streaming of the agents' output. Resolve: #60 Resolve: #61 Resolve: #62
2024-01-10 15:28:09 +07:00
parent 230328c62f
commit 5a9d6f75be
31 changed files with 273 additions and 92 deletions
--- a/knowledgehub/storages/docstores/elasticsearch.py
+++ b/knowledgehub/storages/docstores/elasticsearch.py
@@ -16,6 +16,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
        elasticsearch_url: str = "http://localhost:9200",
        k1: float = 2.0,
        b: float = 0.75,
+        **kwargs,
    ):
        try:
            from elasticsearch import Elasticsearch
@@ -31,7 +32,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
        self.b = b

        # Create an Elasticsearch client instance
-        self.client = Elasticsearch(elasticsearch_url)
+        self.client = Elasticsearch(elasticsearch_url, **kwargs)
        self.es_bulk = bulk
        # Define the index settings and mappings
        settings = {
@@ -63,19 +64,16 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
-        **kwargs
+        refresh_indices: bool = True,
+        **kwargs,
    ):
        """Add document into document store

        Args:
            docs: list of documents to add
-            ids: specify the ids of documents to add or
-                use existing doc.doc_id
-            refresh_indices: request Elasticsearch to update
-                its index (default to True)
+            ids: specify the ids of documents to add or use existing doc.doc_id
+            refresh_indices: request Elasticsearch to update its index (default to True)
        """
-        refresh_indices = kwargs.pop("refresh_indices", True)
-
        if ids and not isinstance(ids, list):
            ids = [ids]
        if not isinstance(docs, list):
@@ -120,7 +118,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
            )
        return docs

-    def query(self, query: str, top_k: int = 10) -> List[Document]:
+    def query(
+        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None
+    ) -> List[Document]:
        """Search Elasticsearch docstore using search query (BM25)

        Args:
@@ -131,7 +131,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
        Returns:
            List[Document]: List of result documents
        """
-        query_dict = {"query": {"match": {"content": query}}, "size": top_k}
+        query_dict: dict = {"query": {"match": {"content": query}}, "size": top_k}
+        if doc_ids:
+            query_dict["query"]["match"]["_id"] = {"values": doc_ids}
        return self.query_raw(query_dict)

    def get(self, ids: Union[List[str], str]) -> List[Document]:
--- a/knowledgehub/storages/docstores/in_memory.py
+++ b/knowledgehub/storages/docstores/in_memory.py
@@ -74,6 +74,11 @@ class InMemoryDocumentStore(BaseDocumentStore):
        """Load document store from path"""
        with open(path) as f:
            store = json.load(f)
+        # TODO: save and load aren't lossless. A Document-subclass will lose
+        # information. Need to edit the `to_dict` and `from_dict` methods in
+        # the Document class.
+        # For better query support, utilize SQLite as the default document store.
+        # Also, for portability, use SQLAlchemy for document store.
        self._store = {key: Document.from_dict(value) for key, value in store.items()}

    def __persist_flow__(self):
--- a/knowledgehub/storages/docstores/simple_file.py
+++ b/knowledgehub/storages/docstores/simple_file.py
@@ -15,6 +15,18 @@ class SimpleFileDocumentStore(InMemoryDocumentStore):
        if path is not None and Path(path).is_file():
            self.load(path)

+    def get(self, ids: Union[List[str], str]) -> List[Document]:
+        """Get document by id"""
+        if not isinstance(ids, list):
+            ids = [ids]
+
+        for doc_id in ids:
+            if doc_id not in self._store:
+                self.load(self._path)
+                break
+
+        return [self._store[doc_id] for doc_id in ids]
+
    def add(
        self,
        docs: Union[Document, List[Document]],
--- a/knowledgehub/storages/vectorstores/base.py
+++ b/knowledgehub/storages/vectorstores/base.py
@@ -76,8 +76,15 @@ class LlamaIndexVectorStore(BaseVectorStore):
                "Require `_li_class` to set a VectorStore class from LlamarIndex"
            )

+        from dataclasses import fields
+
        self._client = self._li_class(*args, **kwargs)

+        self._vsq_kwargs = {_.name for _ in fields(VectorStoreQuery)}
+        for key in ["query_embedding", "similarity_top_k", "node_ids"]:
+            if key in self._vsq_kwargs:
+                self._vsq_kwargs.remove(key)
+
    def __setattr__(self, name: str, value: Any) -> None:
        if name.startswith("_"):
            return super().__setattr__(name, value)
@@ -122,13 +129,35 @@ class LlamaIndexVectorStore(BaseVectorStore):
        ids: Optional[list[str]] = None,
        **kwargs,
    ) -> tuple[list[list[float]], list[float], list[str]]:
+        """Return the top k most similar vector embeddings
+
+        Args:
+            embedding: List of embeddings
+            top_k: Number of most similar embeddings to return
+            ids: List of ids of the embeddings to be queried
+            kwargs: extra query parameters. Depending on the name, these parameters
+                will be used when constructing the VectorStoreQuery object or when
+                performing querying of the underlying vector store.
+
+        Returns:
+            the matched embeddings, the similarity scores, and the ids
+        """
+        vsq_kwargs = {}
+        vs_kwargs = {}
+        for kwkey, kwvalue in kwargs.items():
+            if kwkey in self._vsq_kwargs:
+                vsq_kwargs[kwkey] = kwvalue
+            else:
+                vs_kwargs[kwkey] = kwvalue
+
        output = self._client.query(
            query=VectorStoreQuery(
                query_embedding=embedding,
                similarity_top_k=top_k,
                node_ids=ids,
-                **kwargs,
+                **vsq_kwargs,
            ),
+            **vs_kwargs,
        )

        embeddings = []
--- a/knowledgehub/storages/vectorstores/chroma.py
+++ b/knowledgehub/storages/vectorstores/chroma.py
@@ -64,7 +64,7 @@ class ChromaVectorStore(LlamaIndexVectorStore):
            ids: List of ids of the embeddings to be deleted
            kwargs: meant for vectorstore-specific parameters
        """
-        self._client._collection.delete(ids=ids)
+        self._client.client.delete(ids=ids)

    def delete_collection(self, collection_name: Optional[str] = None):
        """Delete entire collection under specified name from vector stores