[AUR-408] Export logs to Excel (#23)

This CL implements:

- The logic to export logs to Excel.
- Routing of the export logic into the UI.
- A demonstration of this functionality in the `./examples/promptui` project.
Nguyen Trung Duc (john) 2023-09-25 17:20:03 +07:00 committed by GitHub
parent 08b6e5d3fb
commit 4f189dc931
5 changed files with 265 additions and 64 deletions
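
Before the diffs, some orientation: the export is driven by a `logs` section in each pipeline's entry of the promptui config. A sketch of its shape as a Python dict, inferred from the export module and the test at the bottom of this commit (sheet name, step, and field values are illustrative; `"."` appears to address the pipeline step itself):

    # Hypothetical `logs` section for one pipeline entry in the promptui
    # config: "sheet1" becomes the Excel sheet name, "step" names the logged
    # pipeline step, and "variable" picks a keyword argument out of that
    # step's recorded input.
    logs_section = {
        "sheet1": {
            "inputs": [{"name": "text", "step": ".", "variable": "text"}],
            "outputs": [{"name": "answer", "step": "."}],
        },
    }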

kotaemon/contribs/promptui/export.py

@@ -1 +1,138 @@
"""Export logs into Excel file""" """Export logs into Excel file"""
import os
import pickle
from pathlib import Path
from typing import Any, Dict, List, Type, Union
import pandas as pd
import yaml
from theflow.storage import storage
from theflow.utils.modules import import_dotted_string
from kotaemon.base import BaseComponent
+def from_log_to_dict(pipeline_cls: Type[BaseComponent], log_config: dict) -> dict:
+    """Collect the stored pipeline logs into columns for a pandas DataFrame
+
+    Args:
+        pipeline_cls (Type[BaseComponent]): Pipeline class
+        log_config (dict): Log config
+
+    Returns:
+        dict mapping column names to row values, loadable into a DataFrame
+    """
+    # get the directory that stores the run logs of this pipeline
+    pipeline_log_path = storage.url(pipeline_cls().config.store_result)
+    dirs = list(sorted([f.path for f in os.scandir(pipeline_log_path) if f.is_dir()]))
+
+    ids = []
+    params: Dict[str, List[Any]] = {}
+    inputs: Dict[str, List[Any]] = {}
+    outputs: Dict[str, List[Any]] = {}
+    for idx, each_dir in enumerate(dirs):
+        ids.append(str(Path(each_dir).name))
+
+        # get the params
+        params_file = os.path.join(each_dir, "params.pkl")
+        if os.path.exists(params_file):
+            with open(params_file, "rb") as f:
+                each_params = pickle.load(f)
+            for key, value in each_params.items():
+                if key not in params:
+                    params[key] = [None] * len(dirs)
+                params[key][idx] = value
+
+        progress_file = os.path.join(each_dir, "progress.pkl")
+        if os.path.exists(progress_file):
+            with open(progress_file, "rb") as f:
+                progress = pickle.load(f)
+
+            # get the inputs
+            for each_input in log_config["inputs"]:
+                name = each_input["name"]
+                step = each_input["step"]
+                if name not in inputs:
+                    inputs[name] = [None] * len(dirs)
+                variable = each_input.get("variable", "")
+                if variable:
+                    inputs[name][idx] = progress[step]["input"]["kwargs"][variable]
+                else:
+                    inputs[name][idx] = progress[step]["input"]
+
+            # get the outputs
+            for each_output in log_config["outputs"]:
+                name = each_output["name"]
+                step = each_output["step"]
+                if name not in outputs:
+                    outputs[name] = [None] * len(dirs)
+                outputs[name][idx] = progress[step]["output"]
+                if each_output.get("item", ""):
+                    # optionally pick a single item out of this run's output
+                    outputs[name][idx] = outputs[name][idx][each_output["item"]]
+
+    return {"ids": ids, **params, **inputs, **outputs}
+
+
+def export(config: dict, pipeline_def, output_path):
+    """Export the logs described by the config into an Excel file"""
+    pipeline_name = f"{pipeline_def.__module__}.{pipeline_def.__name__}"
+
+    if not config.get("logs", {}):
+        raise ValueError(f"Pipeline {pipeline_name} has no logs to export")
+
+    pds: Dict[str, pd.DataFrame] = {}
+    for log_name, log_def in config["logs"].items():
+        pds[log_name] = pd.DataFrame(from_log_to_dict(pipeline_def, log_def))
+
+    # write each log dataframe to its own sheet of the Excel file
+    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:  # type: ignore
+        for log_name, df in pds.items():
+            df.to_excel(writer, sheet_name=log_name)
+
+
+def export_from_dict(
+    config: Union[str, dict],
+    pipeline: Union[str, Type[BaseComponent]],
+    output_path: str,
+):
+    """Export the logs of a pipeline into an Excel file
+
+    Args:
+        config (str | dict): the config dict, or a path to the config file
+        pipeline (str | Type[BaseComponent]): the pipeline, as a dotted string
+            or as the pipeline class itself
+        output_path (str): path to the output Excel file
+    """
+    # get the relevant config dict
+    config_dict: dict
+    if isinstance(config, str):
+        with open(config) as f:
+            config_dict = yaml.safe_load(f)
+    elif isinstance(config, dict):
+        config_dict = config
+    else:
+        raise TypeError(f"`config` must be str or dict, not {type(config)}")
+
+    # resolve the pipeline class and its config entry
+    pipeline_name: str
+    pipeline_cls: Type[BaseComponent]
+    pipeline_config: dict
+    if isinstance(pipeline, str):
+        if pipeline not in config_dict:
+            raise ValueError(f"Pipeline {pipeline} not found in config file")
+        pipeline_name = pipeline
+        pipeline_cls = import_dotted_string(pipeline, safe=False)
+        pipeline_config = config_dict[pipeline]
+    elif isinstance(pipeline, type) and issubclass(pipeline, BaseComponent):
+        pipeline_name = f"{pipeline.__module__}.{pipeline.__name__}"
+        if pipeline_name not in config_dict:
+            raise ValueError(f"Pipeline {pipeline_name} not found in config file")
+        pipeline_cls = pipeline
+        pipeline_config = config_dict[pipeline_name]
+    else:
+        raise TypeError(
+            f"`pipeline` must be str or subclass of BaseComponent, not {type(pipeline)}"
+        )
+
+    export(pipeline_config, pipeline_cls, output_path)
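
Taken together, the module can be driven programmatically as well as from the UI. A minimal sketch, assuming a config.yaml whose top-level key is the pipeline's dotted string and which contains a `logs` section (the file names here are illustrative):

    from kotaemon.contribs.promptui.export import export_from_dict

    export_from_dict(
        config="config.yaml",  # or a dict of the same structure
        pipeline="tests.simple_pipeline.Pipeline",  # or the class itself
        output_path="exported.xlsx",
    )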

kotaemon/contribs/promptui/ui.py

@@ -1,13 +1,20 @@
+import pickle
+from datetime import datetime
+from pathlib import Path
 from typing import Union

 import gradio as gr
 import yaml
+from theflow.storage import storage
 from theflow.utils.modules import import_dotted_string

 from kotaemon.contribs.promptui.base import COMPONENTS_CLASS, SUPPORTED_COMPONENTS
+from kotaemon.contribs.promptui.export import export

 USAGE_INSTRUCTION = """In case of errors, you can:

+- PromptUI instruction:
+  https://github.com/Cinnamon/kotaemon/wiki/Utilities#prompt-engineering-ui
 - Create bug fix and make PR at: https://github.com/Cinnamon/kotaemon
 - Ping any of @john @tadashi @ian @jacky in Slack channel #llm-productization"""
@@ -73,6 +80,8 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
         outputs.append(component)

+    exported_file = gr.File(label="Output file", show_label=True)
+
     temp = gr.Tab
     with gr.Blocks(analytics_enabled=False, title="Welcome to PromptUI") as demo:
         with gr.Accordion(label="Usage", open=False):
@@ -80,8 +89,10 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
         with gr.Row():
             run_btn = gr.Button("Run")
             run_btn.click(func_run, inputs=inputs + params, outputs=outputs)
-            export_btn = gr.Button("Export")
-            export_btn.click(func_export, inputs=None, outputs=None)
+            export_btn = gr.Button(
+                "Export (Result will be in Exported file next to Output)"
+            )
+            export_btn.click(func_export, inputs=None, outputs=exported_file)
         with gr.Row():
             with gr.Column():
                 with temp("Inputs"):
@@ -91,8 +102,11 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
                     for component in params:
                         component.render()
             with gr.Column():
-                for component in outputs:
-                    component.render()
+                with temp("Outputs"):
+                    for component in outputs:
+                        component.render()
+                with temp("Exported file"):
+                    exported_file.render()

     return demo
@@ -103,6 +117,10 @@ def build_pipeline_ui(config: dict, pipeline_def):
     params_name = list(config.get("params", {}).keys())
     outputs_def = config.get("outputs", [])

+    output_dir: Path = Path(storage.url(pipeline_def().config.store_result))
+    exported_dir = output_dir.parent / "exported"
+    exported_dir.mkdir(parents=True, exist_ok=True)
+
     def run_func(*args):
         inputs = {
             name: value for name, value in zip(inputs_name, args[: len(inputs_name)])
@@ -113,6 +131,13 @@ def build_pipeline_ui(config: dict, pipeline_def):
         pipeline = pipeline_def()
         pipeline.set(params)
         pipeline(**inputs)

+        # persist this run's params next to theflow's progress log so the
+        # exporter can pick them up later
+        with storage.open(
+            storage.url(
+                pipeline.config.store_result, pipeline.last_run.id(), "params.pkl"
+            ),
+            "wb",
+        ) as f:
+            pickle.dump(params, f)
+
         if outputs_def:
             outputs = []
             for output_def in outputs_def:
@@ -122,8 +147,20 @@ def build_pipeline_ui(config: dict, pipeline_def):
                 outputs.append(output)
             return outputs

-    # TODO: export_func is None for now
-    return construct_ui(config, run_func, None)
+    def export_func():
+        name = (
+            f"{pipeline_def.__module__}.{pipeline_def.__name__}_{datetime.now()}.xlsx"
+        )
+        path = str(exported_dir / name)
+        gr.Info(f"Begin exporting {name}...")
+        try:
+            export(config=config, pipeline_def=pipeline_def, output_path=path)
+        except Exception as e:
+            raise gr.Error(f"Failed to export. Please contact project's AIR: {e}")
+        gr.Info(f"Exported {name}. Please go to the `Exported file` tab to download")
+        return path
+
+    return construct_ui(config, run_func, export_func)


 def build_from_dict(config: Union[str, dict]):
@@ -148,4 +185,6 @@ def build_from_dict(config: Union[str, dict]):
     else:
         demo = gr.TabbedInterface(demos, list(config_dict.keys()))

+    demo.queue()
+
     return demo
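
The export wiring above follows a standard Gradio pattern: the click handler returns a file path, and routing that return value to a `gr.File` output renders the file as downloadable. A self-contained sketch of just that pattern (all names and paths are illustrative, not part of this commit):

    import gradio as gr

    def export_func():
        path = "/tmp/exported.txt"  # hypothetical output location
        with open(path, "w") as f:
            f.write("exported content")
        return path  # the returned path populates the gr.File below

    with gr.Blocks() as demo:
        export_btn = gr.Button("Export")
        exported_file = gr.File(label="Output file")
        export_btn.click(export_func, inputs=None, outputs=exported_file)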

setup.py

@@ -35,6 +35,7 @@ setuptools.setup(
"llama-hub", "llama-hub",
"nltk", "nltk",
"gradio", "gradio",
"openpyxl",
], ],
extras_require={ extras_require={
"dev": [ "dev": [

tests/simple_pipeline.py (new file)

@@ -0,0 +1,43 @@
+import tempfile
+from typing import List
+
+from theflow import Node
+
+from kotaemon.base import BaseComponent
+from kotaemon.embeddings import AzureOpenAIEmbeddings
+from kotaemon.llms.completions.openai import AzureOpenAI
+from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
+from kotaemon.vectorstores import ChromaVectorStore
+
+
+class Pipeline(BaseComponent):
+    vectorstore_path: str = str(tempfile.mkdtemp())
+    llm: Node[AzureOpenAI] = Node(
+        default=AzureOpenAI,
+        default_kwargs={
+            "openai_api_base": "https://test.openai.azure.com/",
+            "openai_api_key": "some-key",
+            "openai_api_version": "2023-03-15-preview",
+            "deployment_name": "gpt35turbo",
+            "temperature": 0,
+            "request_timeout": 60,
+        },
+    )
+
+    @Node.decorate(depends_on=["vectorstore_path"])
+    def retrieving_pipeline(self):
+        vector_store = ChromaVectorStore(self.vectorstore_path)
+        embedding = AzureOpenAIEmbeddings(
+            model="text-embedding-ada-002",
+            deployment="embedding-deployment",
+            openai_api_base="https://test.openai.azure.com/",
+            openai_api_key="some-key",
+        )
+        return RetrieveDocumentFromVectorStorePipeline(
+            vector_store=vector_store, embedding=embedding
+        )
+
+    def run_raw(self, text: str) -> str:
+        matched_texts: List[str] = self.retrieving_pipeline(text)
+        return self.llm("\n".join(matched_texts)).text[0]
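
Runs of this pipeline are what produce the logs the exporter collects: theflow records each call under `pipeline.config.store_result`, one directory per run. A sketch of producing such a run (illustration only; presumably the call dispatches to run_raw, and the placeholder Azure credentials above would fail against the real API):

    pipeline = Pipeline()
    # each call is logged under pipeline.config.store_result; with the fake
    # credentials above, the embedding/LLM requests would fail for real
    answer = pipeline("some query text")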

tests/test_promptui.py

@@ -1,66 +1,14 @@
-import pytest
-
 from kotaemon.contribs.promptui.config import export_pipeline_to_config
+from kotaemon.contribs.promptui.export import export_from_dict
 from kotaemon.contribs.promptui.ui import build_from_dict
+
+from .simple_pipeline import Pipeline
-
-
-@pytest.fixture()
-def simple_pipeline_cls(tmp_path):
-    """Create a pipeline class that can be used"""
-    from typing import List
-
-    from theflow import Node
-
-    from kotaemon.base import BaseComponent
-    from kotaemon.embeddings import AzureOpenAIEmbeddings
-    from kotaemon.llms.completions.openai import AzureOpenAI
-    from kotaemon.pipelines.retrieving import (
-        RetrieveDocumentFromVectorStorePipeline,
-    )
-    from kotaemon.vectorstores import ChromaVectorStore
-
-    class Pipeline(BaseComponent):
-        vectorstore_path: str = str(tmp_path)
-        llm: Node[AzureOpenAI] = Node(
-            default=AzureOpenAI,
-            default_kwargs={
-                "openai_api_base": "https://test.openai.azure.com/",
-                "openai_api_key": "some-key",
-                "openai_api_version": "2023-03-15-preview",
-                "deployment_name": "gpt35turbo",
-                "temperature": 0,
-                "request_timeout": 60,
-            },
-        )
-
-        @Node.decorate(depends_on=["vectorstore_path"])
-        def retrieving_pipeline(self):
-            vector_store = ChromaVectorStore(self.vectorstore_path)
-            embedding = AzureOpenAIEmbeddings(
-                model="text-embedding-ada-002",
-                deployment="embedding-deployment",
-                openai_api_base="https://test.openai.azure.com/",
-                openai_api_key="some-key",
-            )
-            return RetrieveDocumentFromVectorStorePipeline(
-                vector_store=vector_store, embedding=embedding
-            )
-
-        def run_raw(self, text: str) -> str:
-            matched_texts: List[str] = self.retrieving_pipeline(text)
-            return self.llm("\n".join(matched_texts)).text[0]
-
-    return Pipeline
-
-
-Pipeline = simple_pipeline_cls
 class TestPromptConfig:
-    def test_export_prompt_config(self, simple_pipeline_cls):
+    def test_export_prompt_config(self):
         """Test if the prompt config is exported correctly"""
-        pipeline = simple_pipeline_cls()
+        pipeline = Pipeline()
         config_dict = export_pipeline_to_config(pipeline)
         config = list(config_dict.values())[0]
@@ -78,9 +26,42 @@
 class TestPromptUI:
-    def test_uigeneration(self, simple_pipeline_cls):
+    def test_uigeneration(self):
         """Test if the gradio UI is exposed without any problem"""
-        pipeline = simple_pipeline_cls()
+        pipeline = Pipeline()
         config = export_pipeline_to_config(pipeline)

         build_from_dict(config)
+
+
+class TestExport:
+    def test_export(self, tmp_path):
+        """Test if the export functionality works without error"""
+        from pathlib import Path
+
+        import yaml
+        from theflow.storage import storage
+
+        config_path = tmp_path / "config.yaml"
+        pipeline = Pipeline()
+        Path(storage.url(pipeline.config.store_result)).mkdir(
+            parents=True, exist_ok=True
+        )
+
+        config_dict = export_pipeline_to_config(pipeline)
+        pipeline_name = list(config_dict.keys())[0]
+        config_dict[pipeline_name]["logs"] = {
+            "sheet1": {
+                "inputs": [{"name": "text", "step": ".", "variable": "text"}],
+                "outputs": [{"name": "answer", "step": "."}],
+            },
+        }
+
+        with open(config_path, "w") as f:
+            yaml.safe_dump(config_dict, f)
+
+        export_from_dict(
+            config=str(config_path),
+            pipeline=pipeline_name,
+            output_path=str(tmp_path / "exported.xlsx"),
+        )
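
A natural follow-up check, not part of this commit, would assert that the workbook actually landed at the requested path:

    # Hypothetical extra assertion for the test above.
    assert (tmp_path / "exported.xlsx").exists()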