[AUR-408] Export logs to Excel (#23)

This CL implements:

- The logic to export logs to Excel.
- Routing of the export logic into the UI.
- A demonstration of this functionality in the `./examples/promptui` project.
Nguyen Trung Duc (john) 2023-09-25 17:20:03 +07:00 committed by GitHub
parent 08b6e5d3fb
commit 4f189dc931
5 changed files with 265 additions and 64 deletions
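
Before the diffs, some orientation: the export is driven by a `logs` section in each pipeline's entry of the promptui config. A sketch of its shape as a Python dict, inferred from the export module and the test at the bottom of this commit (sheet name, step, and field values are illustrative; `"."` appears to address the pipeline step itself):

    # Hypothetical `logs` section for one pipeline entry in the promptui
    # config: "sheet1" becomes the Excel sheet name, "step" names the logged
    # pipeline step, and "variable" picks a keyword argument out of that
    # step's recorded input.
    logs_section = {
        "sheet1": {
            "inputs": [{"name": "text", "step": ".", "variable": "text"}],
            "outputs": [{"name": "answer", "step": "."}],
        },
    }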

kotaemon/contribs/promptui/export.py

@@ -1 +1,138 @@
"""Export logs into Excel file""" """Export logs into Excel file"""
import os
import pickle
from pathlib import Path
from typing import Any, Dict, List, Type, Union
import pandas as pd
import yaml
from theflow.storage import storage
from theflow.utils.modules import import_dotted_string
from kotaemon.base import BaseComponent
+def from_log_to_dict(pipeline_cls: Type[BaseComponent], log_config: dict) -> dict:
+    """Collect the stored pipeline logs into columns for a pandas DataFrame
+
+    Args:
+        pipeline_cls (Type[BaseComponent]): Pipeline class
+        log_config (dict): Log config
+
+    Returns:
+        dict mapping column names to row values, loadable into a DataFrame
+    """
+    # get the directory that stores the run logs of this pipeline
+    pipeline_log_path = storage.url(pipeline_cls().config.store_result)
+    dirs = list(sorted([f.path for f in os.scandir(pipeline_log_path) if f.is_dir()]))
+
+    ids = []
+    params: Dict[str, List[Any]] = {}
+    inputs: Dict[str, List[Any]] = {}
+    outputs: Dict[str, List[Any]] = {}
+    for idx, each_dir in enumerate(dirs):
+        ids.append(str(Path(each_dir).name))
+
+        # get the params
+        params_file = os.path.join(each_dir, "params.pkl")
+        if os.path.exists(params_file):
+            with open(params_file, "rb") as f:
+                each_params = pickle.load(f)
+            for key, value in each_params.items():
+                if key not in params:
+                    params[key] = [None] * len(dirs)
+                params[key][idx] = value
+
+        progress_file = os.path.join(each_dir, "progress.pkl")
+        if os.path.exists(progress_file):
+            with open(progress_file, "rb") as f:
+                progress = pickle.load(f)
+
+            # get the inputs
+            for each_input in log_config["inputs"]:
+                name = each_input["name"]
+                step = each_input["step"]
+                if name not in inputs:
+                    inputs[name] = [None] * len(dirs)
+                variable = each_input.get("variable", "")
+                if variable:
+                    inputs[name][idx] = progress[step]["input"]["kwargs"][variable]
+                else:
+                    inputs[name][idx] = progress[step]["input"]
+
+            # get the outputs
+            for each_output in log_config["outputs"]:
+                name = each_output["name"]
+                step = each_output["step"]
+                if name not in outputs:
+                    outputs[name] = [None] * len(dirs)
+                outputs[name][idx] = progress[step]["output"]
+                if each_output.get("item", ""):
+                    # optionally pick a single item out of this run's output
+                    outputs[name][idx] = outputs[name][idx][each_output["item"]]
+
+    return {"ids": ids, **params, **inputs, **outputs}
+
+
+def export(config: dict, pipeline_def, output_path):
+    """Export the logs described by the config into an Excel file"""
+    pipeline_name = f"{pipeline_def.__module__}.{pipeline_def.__name__}"
+
+    if not config.get("logs", {}):
+        raise ValueError(f"Pipeline {pipeline_name} has no logs to export")
+
+    pds: Dict[str, pd.DataFrame] = {}
+    for log_name, log_def in config["logs"].items():
+        pds[log_name] = pd.DataFrame(from_log_to_dict(pipeline_def, log_def))
+
+    # write each log dataframe to its own sheet of the Excel file
+    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:  # type: ignore
+        for log_name, df in pds.items():
+            df.to_excel(writer, sheet_name=log_name)
+
+
+def export_from_dict(
+    config: Union[str, dict],
+    pipeline: Union[str, Type[BaseComponent]],
+    output_path: str,
+):
+    """Export the logs of a pipeline into an Excel file
+
+    Args:
+        config (str | dict): the config dict, or a path to the config file
+        pipeline (str | Type[BaseComponent]): the pipeline, as a dotted string
+            or as the pipeline class itself
+        output_path (str): path to the output Excel file
+    """
+    # get the relevant config dict
+    config_dict: dict
+    if isinstance(config, str):
+        with open(config) as f:
+            config_dict = yaml.safe_load(f)
+    elif isinstance(config, dict):
+        config_dict = config
+    else:
+        raise TypeError(f"`config` must be str or dict, not {type(config)}")
+
+    # resolve the pipeline class and its config entry
+    pipeline_name: str
+    pipeline_cls: Type[BaseComponent]
+    pipeline_config: dict
+    if isinstance(pipeline, str):
+        if pipeline not in config_dict:
+            raise ValueError(f"Pipeline {pipeline} not found in config file")
+        pipeline_name = pipeline
+        pipeline_cls = import_dotted_string(pipeline, safe=False)
+        pipeline_config = config_dict[pipeline]
+    elif isinstance(pipeline, type) and issubclass(pipeline, BaseComponent):
+        pipeline_name = f"{pipeline.__module__}.{pipeline.__name__}"
+        if pipeline_name not in config_dict:
+            raise ValueError(f"Pipeline {pipeline_name} not found in config file")
+        pipeline_cls = pipeline
+        pipeline_config = config_dict[pipeline_name]
+    else:
+        raise TypeError(
+            f"`pipeline` must be str or subclass of BaseComponent, not {type(pipeline)}"
+        )
+
+    export(pipeline_config, pipeline_cls, output_path)
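
Taken together, the module can be driven programmatically as well as from the UI. A minimal sketch, assuming a config.yaml whose top-level key is the pipeline's dotted string and which contains a `logs` section (the file names here are illustrative):

    from kotaemon.contribs.promptui.export import export_from_dict

    export_from_dict(
        config="config.yaml",  # or a dict of the same structure
        pipeline="tests.simple_pipeline.Pipeline",  # or the class itself
        output_path="exported.xlsx",
    )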

kotaemon/contribs/promptui/ui.py

@@ -1,13 +1,20 @@
+import pickle
+from datetime import datetime
+from pathlib import Path
 from typing import Union

 import gradio as gr
 import yaml
+from theflow.storage import storage
 from theflow.utils.modules import import_dotted_string

 from kotaemon.contribs.promptui.base import COMPONENTS_CLASS, SUPPORTED_COMPONENTS
+from kotaemon.contribs.promptui.export import export

 USAGE_INSTRUCTION = """In case of errors, you can:

+- PromptUI instruction:
+  https://github.com/Cinnamon/kotaemon/wiki/Utilities#prompt-engineering-ui
 - Create bug fix and make PR at: https://github.com/Cinnamon/kotaemon
 - Ping any of @john @tadashi @ian @jacky in Slack channel #llm-productization"""
@@ -73,6 +80,8 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
         outputs.append(component)

+    exported_file = gr.File(label="Output file", show_label=True)
+
     temp = gr.Tab
     with gr.Blocks(analytics_enabled=False, title="Welcome to PromptUI") as demo:
         with gr.Accordion(label="Usage", open=False):
@@ -80,8 +89,10 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
         with gr.Row():
             run_btn = gr.Button("Run")
             run_btn.click(func_run, inputs=inputs + params, outputs=outputs)
-            export_btn = gr.Button("Export")
-            export_btn.click(func_export, inputs=None, outputs=None)
+            export_btn = gr.Button(
+                "Export (Result will be in Exported file next to Output)"
+            )
+            export_btn.click(func_export, inputs=None, outputs=exported_file)
         with gr.Row():
             with gr.Column():
                 with temp("Inputs"):
@@ -91,8 +102,11 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
                     for component in params:
                         component.render()
             with gr.Column():
-                for component in outputs:
-                    component.render()
+                with temp("Outputs"):
+                    for component in outputs:
+                        component.render()
+                with temp("Exported file"):
+                    exported_file.render()

     return demo
@@ -103,6 +117,10 @@ def build_pipeline_ui(config: dict, pipeline_def):
     params_name = list(config.get("params", {}).keys())
     outputs_def = config.get("outputs", [])

+    output_dir: Path = Path(storage.url(pipeline_def().config.store_result))
+    exported_dir = output_dir.parent / "exported"
+    exported_dir.mkdir(parents=True, exist_ok=True)
+
     def run_func(*args):
         inputs = {
             name: value for name, value in zip(inputs_name, args[: len(inputs_name)])
@@ -113,6 +131,13 @@ def build_pipeline_ui(config: dict, pipeline_def):
         pipeline = pipeline_def()
         pipeline.set(params)
         pipeline(**inputs)

+        # persist this run's params next to theflow's progress log so the
+        # exporter can pick them up later
+        with storage.open(
+            storage.url(
+                pipeline.config.store_result, pipeline.last_run.id(), "params.pkl"
+            ),
+            "wb",
+        ) as f:
+            pickle.dump(params, f)
+
         if outputs_def:
             outputs = []
             for output_def in outputs_def:
@@ -122,8 +147,20 @@ def build_pipeline_ui(config: dict, pipeline_def):
                 outputs.append(output)
             return outputs

-    # TODO: export_func is None for now
-    return construct_ui(config, run_func, None)
+    def export_func():
+        name = (
+            f"{pipeline_def.__module__}.{pipeline_def.__name__}_{datetime.now()}.xlsx"
+        )
+        path = str(exported_dir / name)
+        gr.Info(f"Begin exporting {name}...")
+        try:
+            export(config=config, pipeline_def=pipeline_def, output_path=path)
+        except Exception as e:
+            raise gr.Error(f"Failed to export. Please contact project's AIR: {e}")
+        gr.Info(f"Exported {name}. Please go to the `Exported file` tab to download")
+        return path
+
+    return construct_ui(config, run_func, export_func)


 def build_from_dict(config: Union[str, dict]):
@@ -148,4 +185,6 @@ def build_from_dict(config: Union[str, dict]):
     else:
         demo = gr.TabbedInterface(demos, list(config_dict.keys()))

+    demo.queue()
+
     return demo
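
The export wiring above follows a standard Gradio pattern: the click handler returns a file path, and routing that return value to a `gr.File` output renders the file as downloadable. A self-contained sketch of just that pattern (all names and paths are illustrative, not part of this commit):

    import gradio as gr

    def export_func():
        path = "/tmp/exported.txt"  # hypothetical output location
        with open(path, "w") as f:
            f.write("exported content")
        return path  # the returned path populates the gr.File below

    with gr.Blocks() as demo:
        export_btn = gr.Button("Export")
        exported_file = gr.File(label="Output file")
        export_btn.click(export_func, inputs=None, outputs=exported_file)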

setup.py

@@ -35,6 +35,7 @@ setuptools.setup(
"llama-hub", "llama-hub",
"nltk", "nltk",
"gradio", "gradio",
"openpyxl",
], ],
extras_require={ extras_require={
"dev": [ "dev": [

tests/simple_pipeline.py (new file)

@@ -0,0 +1,43 @@
+import tempfile
+from typing import List
+
+from theflow import Node
+
+from kotaemon.base import BaseComponent
+from kotaemon.embeddings import AzureOpenAIEmbeddings
+from kotaemon.llms.completions.openai import AzureOpenAI
+from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
+from kotaemon.vectorstores import ChromaVectorStore
+
+
+class Pipeline(BaseComponent):
+    vectorstore_path: str = str(tempfile.mkdtemp())
+    llm: Node[AzureOpenAI] = Node(
+        default=AzureOpenAI,
+        default_kwargs={
+            "openai_api_base": "https://test.openai.azure.com/",
+            "openai_api_key": "some-key",
+            "openai_api_version": "2023-03-15-preview",
+            "deployment_name": "gpt35turbo",
+            "temperature": 0,
+            "request_timeout": 60,
+        },
+    )
+
+    @Node.decorate(depends_on=["vectorstore_path"])
+    def retrieving_pipeline(self):
+        vector_store = ChromaVectorStore(self.vectorstore_path)
+        embedding = AzureOpenAIEmbeddings(
+            model="text-embedding-ada-002",
+            deployment="embedding-deployment",
+            openai_api_base="https://test.openai.azure.com/",
+            openai_api_key="some-key",
+        )
+        return RetrieveDocumentFromVectorStorePipeline(
+            vector_store=vector_store, embedding=embedding
+        )
+
+    def run_raw(self, text: str) -> str:
+        matched_texts: List[str] = self.retrieving_pipeline(text)
+        return self.llm("\n".join(matched_texts)).text[0]
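
Runs of this pipeline are what produce the logs the exporter collects: theflow records each call under `pipeline.config.store_result`, one directory per run. A sketch of producing such a run (illustration only; presumably the call dispatches to run_raw, and the placeholder Azure credentials above would fail against the real API):

    pipeline = Pipeline()
    # each call is logged under pipeline.config.store_result; with the fake
    # credentials above, the embedding/LLM requests would fail for real
    answer = pipeline("some query text")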

tests/test_promptui.py

@@ -1,66 +1,14 @@
-import pytest
-
 from kotaemon.contribs.promptui.config import export_pipeline_to_config
+from kotaemon.contribs.promptui.export import export_from_dict
 from kotaemon.contribs.promptui.ui import build_from_dict
+
+from .simple_pipeline import Pipeline
-
-
-@pytest.fixture()
-def simple_pipeline_cls(tmp_path):
-    """Create a pipeline class that can be used"""
-    from typing import List
-
-    from theflow import Node
-
-    from kotaemon.base import BaseComponent
-    from kotaemon.embeddings import AzureOpenAIEmbeddings
-    from kotaemon.llms.completions.openai import AzureOpenAI
-    from kotaemon.pipelines.retrieving import (
-        RetrieveDocumentFromVectorStorePipeline,
-    )
-    from kotaemon.vectorstores import ChromaVectorStore
-
-    class Pipeline(BaseComponent):
-        vectorstore_path: str = str(tmp_path)
-        llm: Node[AzureOpenAI] = Node(
-            default=AzureOpenAI,
-            default_kwargs={
-                "openai_api_base": "https://test.openai.azure.com/",
-                "openai_api_key": "some-key",
-                "openai_api_version": "2023-03-15-preview",
-                "deployment_name": "gpt35turbo",
-                "temperature": 0,
-                "request_timeout": 60,
-            },
-        )
-
-        @Node.decorate(depends_on=["vectorstore_path"])
-        def retrieving_pipeline(self):
-            vector_store = ChromaVectorStore(self.vectorstore_path)
-            embedding = AzureOpenAIEmbeddings(
-                model="text-embedding-ada-002",
-                deployment="embedding-deployment",
-                openai_api_base="https://test.openai.azure.com/",
-                openai_api_key="some-key",
-            )
-            return RetrieveDocumentFromVectorStorePipeline(
-                vector_store=vector_store, embedding=embedding
-            )
-
-        def run_raw(self, text: str) -> str:
-            matched_texts: List[str] = self.retrieving_pipeline(text)
-            return self.llm("\n".join(matched_texts)).text[0]
-
-    return Pipeline
-
-
-Pipeline = simple_pipeline_cls
 class TestPromptConfig:
-    def test_export_prompt_config(self, simple_pipeline_cls):
+    def test_export_prompt_config(self):
         """Test if the prompt config is exported correctly"""
-        pipeline = simple_pipeline_cls()
+        pipeline = Pipeline()
         config_dict = export_pipeline_to_config(pipeline)
         config = list(config_dict.values())[0]
@@ -78,9 +26,42 @@
 class TestPromptUI:
-    def test_uigeneration(self, simple_pipeline_cls):
+    def test_uigeneration(self):
         """Test if the gradio UI is exposed without any problem"""
-        pipeline = simple_pipeline_cls()
+        pipeline = Pipeline()
         config = export_pipeline_to_config(pipeline)

         build_from_dict(config)
+
+
+class TestExport:
+    def test_export(self, tmp_path):
+        """Test if the export functionality works without error"""
+        from pathlib import Path
+
+        import yaml
+        from theflow.storage import storage
+
+        config_path = tmp_path / "config.yaml"
+        pipeline = Pipeline()
+        Path(storage.url(pipeline.config.store_result)).mkdir(
+            parents=True, exist_ok=True
+        )
+
+        config_dict = export_pipeline_to_config(pipeline)
+        pipeline_name = list(config_dict.keys())[0]
+        config_dict[pipeline_name]["logs"] = {
+            "sheet1": {
+                "inputs": [{"name": "text", "step": ".", "variable": "text"}],
+                "outputs": [{"name": "answer", "step": "."}],
+            },
+        }
+
+        with open(config_path, "w") as f:
+            yaml.safe_dump(config_dict, f)
+
+        export_from_dict(
+            config=str(config_path),
+            pipeline=pipeline_name,
+            output_path=str(tmp_path / "exported.xlsx"),
+        )
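
A natural follow-up check, not part of this commit, would assert that the workbook actually landed at the requested path:

    # Hypothetical extra assertion for the test above.
    assert (tmp_path / "exported.xlsx").exists()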