Upgrade the declarative pipeline for a cleaner interface (#51)

This commit is contained in:
Nguyen Trung Duc (john)
2023-10-24 11:12:22 +07:00
committed by GitHub
parent aab982ddc4
commit 9035e25666
26 changed files with 365 additions and 169 deletions

View File

@@ -13,3 +13,17 @@ def mock_google_search(monkeypatch):
)
monkeypatch.setattr(googlesearch, "search", result)
def if_haystack_not_installed():
    """Report whether the optional ``haystack`` package is unavailable.

    Returns:
        bool: ``True`` when importing ``haystack`` raises ``ImportError``
        (i.e. the dependency is missing), ``False`` otherwise.
    """
    try:
        import haystack  # noqa: F401

        # Import succeeded, so the dependency is present.
        return False
    except ImportError:
        return True
# Reusable pytest marker: apply to tests that require the optional
# `haystack` dependency.  The skip condition is evaluated once, at module
# import time, via the sibling `if_haystack_not_installed()` helper.
skip_when_haystack_not_installed = pytest.mark.skipif(
    if_haystack_not_installed(), reason="Haystack is not installed"
)

View File

@@ -1,7 +1,7 @@
import tempfile
from typing import List
from theflow import Node
from theflow.utils.modules import ObjectInitDeclaration as _
from kotaemon.base import BaseComponent
from kotaemon.embeddings import AzureOpenAIEmbeddings
@@ -11,33 +11,27 @@ from kotaemon.vectorstores import ChromaVectorStore
class Pipeline(BaseComponent):
vectorstore_path: str = str(tempfile.mkdtemp())
llm: Node[AzureOpenAI] = Node(
default=AzureOpenAI,
default_kwargs={
"openai_api_base": "https://test.openai.azure.com/",
"openai_api_key": "some-key",
"openai_api_version": "2023-03-15-preview",
"deployment_name": "gpt35turbo",
"temperature": 0,
"request_timeout": 60,
},
llm: AzureOpenAI = AzureOpenAI.withx(
openai_api_base="https://test.openai.azure.com/",
openai_api_key="some-key",
openai_api_version="2023-03-15-preview",
deployment_name="gpt35turbo",
temperature=0,
request_timeout=60,
)
@Node.decorate(depends_on=["vectorstore_path"])
def retrieving_pipeline(self):
vector_store = ChromaVectorStore(self.vectorstore_path)
embedding = AzureOpenAIEmbeddings(
model="text-embedding-ada-002",
deployment="embedding-deployment",
openai_api_base="https://test.openai.azure.com/",
openai_api_key="some-key",
)
return RetrieveDocumentFromVectorStorePipeline(
vector_store=vector_store, embedding=embedding
retrieving_pipeline: RetrieveDocumentFromVectorStorePipeline = (
RetrieveDocumentFromVectorStorePipeline.withx(
vector_store=_(ChromaVectorStore).withx(path=str(tempfile.mkdtemp())),
embedding=AzureOpenAIEmbeddings.withx(
model="text-embedding-ada-002",
deployment="embedding-deployment",
openai_api_base="https://test.openai.azure.com/",
openai_api_key="some-key",
),
)
)
def run_raw(self, text: str) -> str:
matched_texts: List[str] = self.retrieving_pipeline(text)
return self.llm("\n".join(matched_texts)).text[0]
return self.llm("\n".join(matched_texts)).text

View File

@@ -1,7 +1,7 @@
from haystack.schema import Document as HaystackDocument
from kotaemon.documents.base import Document, RetrievedDocument
from .conftest import skip_when_haystack_not_installed
def test_document_constructor_with_builtin_types():
for value in ["str", 1, {}, set(), [], tuple, None]:
@@ -19,7 +19,10 @@ def test_document_constructor_with_document():
assert doc2.content == doc1.content
@skip_when_haystack_not_installed
def test_document_to_haystack_format():
from haystack.schema import Document as HaystackDocument
text = "Sample text"
metadata = {"filename": "sample.txt"}
doc = Document(text, metadata=metadata)

View File

@@ -16,7 +16,6 @@ class TestPromptConfig:
assert "text" in config["inputs"], "inputs should have config"
assert "params" in config, "params should be in config"
assert "vectorstore_path" in config["params"]
assert "llm.deployment_name" in config["params"]
assert "llm.openai_api_base" in config["params"]
assert "llm.openai_api_key" in config["params"]

View File

@@ -42,8 +42,9 @@ def mock_openai_embedding(monkeypatch):
)
def test_ingest_pipeline(patch, mock_openai_embedding, tmp_path):
indexing_pipeline = ReaderIndexingPipeline(
storage=tmp_path, openai_api_key="some-key"
storage_path=tmp_path,
)
indexing_pipeline.embedding.openai_api_key = "some-key"
input_file_path = Path(__file__).parent / "resources/dummy.pdf"
# call ingestion pipeline

View File

@@ -3,7 +3,7 @@ from pathlib import Path
from langchain.schema import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser
from kotaemon.documents.base import Document, HaystackDocument
from kotaemon.documents.base import Document
from kotaemon.loaders import AutoReader
@@ -19,10 +19,6 @@ def test_pdf_reader():
assert isinstance(first_doc, Document)
assert first_doc.text.lower().replace(" ", "") == "dummypdffile"
# check conversion output
haystack_doc = first_doc.to_haystack_format()
assert isinstance(haystack_doc, HaystackDocument)
langchain_doc = first_doc.to_langchain_format()
assert isinstance(langchain_doc, LangchainDocument)

View File

@@ -3,6 +3,8 @@ import sys
import pytest
from .conftest import skip_when_haystack_not_installed
@pytest.fixture
def clean_artifacts_for_telemetry():
@@ -26,6 +28,7 @@ def clean_artifacts_for_telemetry():
@pytest.mark.usefixtures("clean_artifacts_for_telemetry")
@skip_when_haystack_not_installed
def test_disable_telemetry_import_haystack_first():
"""Test that telemetry is disabled when kotaemon lib is initiated after"""
import os
@@ -42,6 +45,7 @@ def test_disable_telemetry_import_haystack_first():
@pytest.mark.usefixtures("clean_artifacts_for_telemetry")
@skip_when_haystack_not_installed
def test_disable_telemetry_import_haystack_after_kotaemon():
"""Test that telemetry is disabled when kotaemon lib is initiated before"""
import os