Enforce all IO objects to be subclassed from Document (#88)

* enforce Document as IO

* Separate rerankers, splitters and extractors (#85)

* partially refactor importing

* add text to embedding outputs

---------

Co-authored-by: Nguyen Trung Duc (john) <trungduc1992@gmail.com>
This commit is contained in:
ian_Cin
2023-11-27 16:35:09 +07:00
committed by GitHub
parent 2186c5558f
commit 8e0779a22d
13 changed files with 108 additions and 59 deletions

View File

@@ -3,7 +3,7 @@ from typing import Callable, List
from theflow import Function, Node, Param
from kotaemon.base import BaseComponent
from kotaemon.base import BaseComponent, Document
from kotaemon.llms import LLM, BasePromptComponent
from kotaemon.llms.chats.openai import AzureChatOpenAI
@@ -65,15 +65,19 @@ class Thought(BaseComponent):
"""
prompt: str = Param(
help="The prompt template string. This prompt template has Python-like "
"variable placeholders, that then will be subsituted with real values when "
"this component is executed"
help=(
"The prompt template string. This prompt template has Python-like "
"variable placeholders, that then will be subsituted with real values when "
"this component is executed"
)
)
llm: LLM = Node(AzureChatOpenAI, help="The LLM model to execute the input prompt")
post_process: Function = Node(
help="The function post-processor that post-processes LLM output prediction ."
"It should take a string as input (this is the LLM output text) and return "
"a dictionary, where the key should"
help=(
"The function post-processor that post-processes LLM output prediction ."
"It should take a string as input (this is the LLM output text) and return "
"a dictionary, where the key should"
)
)
@Node.auto(depends_on="prompt")
@@ -81,11 +85,13 @@ class Thought(BaseComponent):
"""Automatically wrap around param prompt. Can ignore"""
return BasePromptComponent(self.prompt)
def run(self, **kwargs) -> dict:
def run(self, **kwargs) -> Document:
"""Run the chain of thought"""
prompt = self.prompt_template(**kwargs).text
response = self.llm(prompt).text
return self.post_process(response)
response = self.post_process(response)
return Document(response)
def get_variables(self) -> List[str]:
return []
@@ -146,7 +152,7 @@ class ManualSequentialChainOfThought(BaseComponent):
help="Callback on terminate condition. Default to always return False",
)
def run(self, **kwargs) -> dict:
def run(self, **kwargs) -> Document:
"""Run the manual chain of thought"""
inputs = deepcopy(kwargs)
@@ -156,11 +162,11 @@ class ManualSequentialChainOfThought(BaseComponent):
self._prepare_child(thought, f"thought{idx}")
output = thought(**inputs)
inputs.update(output)
inputs.update(output.content)
if self.terminate(inputs):
break
return inputs
return Document(inputs)
def __add__(self, next_thought: Thought) -> "ManualSequentialChainOfThought":
return ManualSequentialChainOfThought(

View File

@@ -3,12 +3,10 @@ from __future__ import annotations
from pathlib import Path
from typing import Optional, Sequence
from kotaemon.base import BaseComponent, Document, RetrievedDocument
from kotaemon.embeddings import BaseEmbeddings
from kotaemon.indices.rankings import BaseReranking
from ..base import BaseComponent
from ..base.schema import Document, RetrievedDocument
from ..embeddings import BaseEmbeddings
from ..storages import BaseDocumentStore, BaseVectorStore
from kotaemon.storages import BaseDocumentStore, BaseVectorStore
VECTOR_STORE_FNAME = "vectorstore"
DOC_STORE_FNAME = "docstore"
@@ -45,7 +43,7 @@ class RetrieveDocumentFromVectorStorePipeline(BaseComponent):
"retrieve the documents"
)
emb: list[float] = self.embedding(text)[0]
emb: list[float] = self.embedding(text)[0].embedding
_, scores, ids = self.vector_store.query(embedding=emb, top_k=top_k)
docs = self.doc_store.get(ids)
result = [