Upgrade the declarative pipeline for a cleaner interface (#51)

This commit is contained in:
Nguyen Trung Duc (john)
2023-10-24 11:12:22 +07:00
committed by GitHub
parent aab982ddc4
commit 9035e25666
26 changed files with 365 additions and 169 deletions

View File

@@ -13,3 +13,17 @@ def mock_google_search(monkeypatch):
)
monkeypatch.setattr(googlesearch, "search", result)
def if_haystack_not_installed():
    """Report whether the optional ``haystack`` package is unavailable.

    Returns:
        bool: ``True`` when importing ``haystack`` raises ``ImportError``
        (i.e. the dependency is missing), ``False`` otherwise.
    """
    try:
        import haystack  # noqa: F401

        # Import succeeded, so the dependency is present.
        return False
    except ImportError:
        return True
# Reusable pytest marker: apply to tests that require the optional
# `haystack` dependency.  The skip condition is evaluated once, at module
# import time, via the sibling `if_haystack_not_installed()` helper.
skip_when_haystack_not_installed = pytest.mark.skipif(
    if_haystack_not_installed(), reason="Haystack is not installed"
)

View File

@@ -1,7 +1,7 @@
import tempfile
from typing import List
from theflow import Node
from theflow.utils.modules import ObjectInitDeclaration as _
from kotaemon.base import BaseComponent
from kotaemon.embeddings import AzureOpenAIEmbeddings
@@ -11,33 +11,27 @@ from kotaemon.vectorstores import ChromaVectorStore
class Pipeline(BaseComponent):
vectorstore_path: str = str(tempfile.mkdtemp())
llm: Node[AzureOpenAI] = Node(
default=AzureOpenAI,
default_kwargs={
"openai_api_base": "https://test.openai.azure.com/",
"openai_api_key": "some-key",
"openai_api_version": "2023-03-15-preview",
"deployment_name": "gpt35turbo",
"temperature": 0,
"request_timeout": 60,
},
llm: AzureOpenAI = AzureOpenAI.withx(
openai_api_base="https://test.openai.azure.com/",
openai_api_key="some-key",
openai_api_version="2023-03-15-preview",
deployment_name="gpt35turbo",
temperature=0,
request_timeout=60,
)
@Node.decorate(depends_on=["vectorstore_path"])
def retrieving_pipeline(self):
vector_store = ChromaVectorStore(self.vectorstore_path)
embedding = AzureOpenAIEmbeddings(
model="text-embedding-ada-002",
deployment="embedding-deployment",
openai_api_base="https://test.openai.azure.com/",
openai_api_key="some-key",
)
return RetrieveDocumentFromVectorStorePipeline(
vector_store=vector_store, embedding=embedding
retrieving_pipeline: RetrieveDocumentFromVectorStorePipeline = (
RetrieveDocumentFromVectorStorePipeline.withx(
vector_store=_(ChromaVectorStore).withx(path=str(tempfile.mkdtemp())),
embedding=AzureOpenAIEmbeddings.withx(
model="text-embedding-ada-002",
deployment="embedding-deployment",
openai_api_base="https://test.openai.azure.com/",
openai_api_key="some-key",
),
)
)
def run_raw(self, text: str) -> str:
matched_texts: List[str] = self.retrieving_pipeline(text)
return self.llm("\n".join(matched_texts)).text[0]
return self.llm("\n".join(matched_texts)).text

View File

@@ -1,7 +1,7 @@
from haystack.schema import Document as HaystackDocument
from kotaemon.documents.base import Document, RetrievedDocument
from .conftest import skip_when_haystack_not_installed
def test_document_constructor_with_builtin_types():
for value in ["str", 1, {}, set(), [], tuple, None]:
@@ -19,7 +19,10 @@ def test_document_constructor_with_document():
assert doc2.content == doc1.content
@skip_when_haystack_not_installed
def test_document_to_haystack_format():
from haystack.schema import Document as HaystackDocument
text = "Sample text"
metadata = {"filename": "sample.txt"}
doc = Document(text, metadata=metadata)

View File

@@ -16,7 +16,6 @@ class TestPromptConfig:
assert "text" in config["inputs"], "inputs should have config"
assert "params" in config, "params should be in config"
assert "vectorstore_path" in config["params"]
assert "llm.deployment_name" in config["params"]
assert "llm.openai_api_base" in config["params"]
assert "llm.openai_api_key" in config["params"]

View File

@@ -42,8 +42,9 @@ def mock_openai_embedding(monkeypatch):
)
def test_ingest_pipeline(patch, mock_openai_embedding, tmp_path):
indexing_pipeline = ReaderIndexingPipeline(
storage=tmp_path, openai_api_key="some-key"
storage_path=tmp_path,
)
indexing_pipeline.embedding.openai_api_key = "some-key"
input_file_path = Path(__file__).parent / "resources/dummy.pdf"
# call ingestion pipeline

View File

@@ -3,7 +3,7 @@ from pathlib import Path
from langchain.schema import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser
from kotaemon.documents.base import Document, HaystackDocument
from kotaemon.documents.base import Document
from kotaemon.loaders import AutoReader
@@ -19,10 +19,6 @@ def test_pdf_reader():
assert isinstance(first_doc, Document)
assert first_doc.text.lower().replace(" ", "") == "dummypdffile"
# check conversion output
haystack_doc = first_doc.to_haystack_format()
assert isinstance(haystack_doc, HaystackDocument)
langchain_doc = first_doc.to_langchain_format()
assert isinstance(langchain_doc, LangchainDocument)

View File

@@ -3,6 +3,8 @@ import sys
import pytest
from .conftest import skip_when_haystack_not_installed
@pytest.fixture
def clean_artifacts_for_telemetry():
@@ -26,6 +28,7 @@ def clean_artifacts_for_telemetry():
@pytest.mark.usefixtures("clean_artifacts_for_telemetry")
@skip_when_haystack_not_installed
def test_disable_telemetry_import_haystack_first():
"""Test that telemetry is disabled when kotaemon lib is initiated after"""
import os
@@ -42,6 +45,7 @@ def test_disable_telemetry_import_haystack_first():
@pytest.mark.usefixtures("clean_artifacts_for_telemetry")
@skip_when_haystack_not_installed
def test_disable_telemetry_import_haystack_after_kotaemon():
"""Test that telemetry is disabled when kotaemon lib is initiated before"""
import os