[AUR-408] Export logs to Excel (#23)

This CL implements:

- The logic to export logs to Excel.
- Routing of the export logic into the UI.
- A demonstration of this functionality in the `./examples/promptui` project.
This commit is contained in:
Nguyen Trung Duc (john)
2023-09-25 17:20:03 +07:00
committed by GitHub
parent 08b6e5d3fb
commit 4f189dc931
5 changed files with 265 additions and 64 deletions

View File

@@ -1 +1,138 @@
"""Export logs into Excel file"""
import os
import pickle
from pathlib import Path
from typing import Any, Dict, List, Type, Union
import pandas as pd
import yaml
from theflow.storage import storage
from theflow.utils.modules import import_dotted_string
from kotaemon.base import BaseComponent
def from_log_to_dict(pipeline_cls: Type[BaseComponent], log_config: dict) -> dict:
    """Collect the logged runs of a pipeline into a column-oriented dict.

    Every sub-directory of the pipeline's result store is treated as one run.
    For each run the optional ``params.pkl`` and ``progress.pkl`` files are read
    and the values selected by ``log_config`` are stored in per-column lists.
    Each list has one slot per run; a slot stays ``None`` when the run has no
    value for that column.

    Args:
        pipeline_cls (Type[BaseComponent]): Pipeline class. It is instantiated
            once to read ``config.store_result``.
        log_config (dict): Log config with "inputs" and "outputs" lists. Each
            entry has a "name" (column name) and a "step" (key in the progress
            log). An input entry may have a "variable" (key inside the step's
            input kwargs); an output entry may have an "item" (key or index into
            the step's output).

    Returns:
        dict: ``{"ids": [...], <param>: [...], <input>: [...], <output>: [...]}``
            where every list is ordered by sorted run directory path.
    """
    # get the directory
    pipeline_log_path = storage.url(pipeline_cls().config.store_result)
    # Use scandir as a context manager so the directory handle is released.
    with os.scandir(pipeline_log_path) as entries:
        dirs = sorted(entry.path for entry in entries if entry.is_dir())
    n_runs = len(dirs)

    ids = []
    params: Dict[str, List[Any]] = {}
    inputs: Dict[str, List[Any]] = {}
    outputs: Dict[str, List[Any]] = {}

    for idx, each_dir in enumerate(dirs):
        ids.append(str(Path(each_dir).name))

        # get the params
        params_file = os.path.join(each_dir, "params.pkl")
        if os.path.exists(params_file):
            # NOTE(security): pickle.load executes arbitrary code from the file;
            # only point this at result stores written by trusted pipelines.
            with open(params_file, "rb") as f:
                each_params = pickle.load(f)
            for key, value in each_params.items():
                if key not in params:
                    params[key] = [None] * n_runs
                params[key][idx] = value

        progress_file = os.path.join(each_dir, "progress.pkl")
        if not os.path.exists(progress_file):
            # Nothing was logged for this run: its input/output slots stay None.
            continue

        # NOTE(security): same pickle caveat as for params.pkl above.
        with open(progress_file, "rb") as f:
            progress = pickle.load(f)

        # get the inputs
        for each_input in log_config["inputs"]:
            name = each_input["name"]
            step = each_input["step"]
            if name not in inputs:
                inputs[name] = [None] * n_runs
            variable = each_input.get("variable", "")
            if variable:
                inputs[name][idx] = progress[step]["input"]["kwargs"][variable]
            else:
                inputs[name][idx] = progress[step]["input"]

        # get the outputs
        for each_output in log_config["outputs"]:
            name = each_output["name"]
            step = each_output["step"]
            if name not in outputs:
                outputs[name] = [None] * n_runs
            value = progress[step]["output"]
            item = each_output.get("item", "")
            # "" / None mean "no item configured"; 0 is a legitimate index, so
            # do not rely on truthiness here.
            if item is not None and item != "":
                # Select the item from THIS run's output, not from the list of
                # per-run values.
                value = value[item]
            outputs[name][idx] = value

    return {"ids": ids, **params, **inputs, **outputs}
def export(config: dict, pipeline_def, output_path):
    """Write the logs described by a pipeline config into an Excel workbook.

    Each entry of ``config["logs"]`` is converted to a DataFrame and written to
    its own sheet, named after the entry's key.

    Args:
        config (dict): Pipeline config; its "logs" mapping goes from log name
            to log definition.
        pipeline_def: Pipeline class whose logs are exported.
        output_path: Destination of the Excel file.

    Raises:
        ValueError: If the config has no "logs" entry or the entry is empty.
    """
    qualified_name = f"{pipeline_def.__module__}.{pipeline_def.__name__}"

    log_defs = config.get("logs", {})
    if not log_defs:
        raise ValueError(f"Pipeline {qualified_name} has no logs to export")

    # Build every sheet first, so reading the logs happens before the output
    # file is opened.
    sheets: Dict[str, pd.DataFrame] = {
        sheet_name: pd.DataFrame(from_log_to_dict(pipeline_def, log_def))
        for sheet_name, log_def in log_defs.items()
    }

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:  # type: ignore
        for sheet_name, frame in sheets.items():
            frame.to_excel(writer, sheet_name=sheet_name)
def export_from_dict(
    config: Union[str, dict],
    pipeline: Union[str, Type[BaseComponent]],
    output_path: str,
):
    """Export the logs of one pipeline into an Excel file.

    Args:
        config (Union[str, dict]): Path to a YAML config file, or the already
            loaded config dict. The config is keyed by dotted pipeline name
            (``module.ClassName``).
        pipeline (Union[str, Type[BaseComponent]]): Dotted name of the pipeline
            class, or the pipeline class itself.
        output_path (str): Path to the output Excel file.

    Raises:
        TypeError: If `config` or `pipeline` has an unsupported type.
        ValueError: If the pipeline is not declared in the config.
    """
    # get the relevant config dict
    config_dict: dict
    if isinstance(config, str):
        with open(config, encoding="utf-8") as f:
            # An empty YAML document loads as None; treat it as an empty config
            # so the lookup below reports "not found" instead of failing on None.
            config_dict = yaml.safe_load(f) or {}
    elif isinstance(config, dict):
        config_dict = config
    else:
        raise TypeError(f"`config` must be str or dict, not {type(config)}")

    # resolve the dotted name used as key in the config
    pipeline_name: str
    if isinstance(pipeline, str):
        pipeline_name = pipeline
    elif isinstance(pipeline, type) and issubclass(pipeline, BaseComponent):
        pipeline_name = f"{pipeline.__module__}.{pipeline.__name__}"
    else:
        raise TypeError(
            f"`pipeline` must be str or subclass of BaseComponent, not {type(pipeline)}"
        )

    # Check the config before importing, so an unknown name is reported without
    # importing any module.
    if pipeline_name not in config_dict:
        raise ValueError(f"Pipeline {pipeline_name} not found in config file")

    pipeline_cls: Type[BaseComponent] = (
        import_dotted_string(pipeline_name, safe=False)
        if isinstance(pipeline, str)
        else pipeline
    )

    export(config_dict[pipeline_name], pipeline_cls, output_path)

View File

@@ -1,13 +1,20 @@
import pickle
from datetime import datetime
from pathlib import Path
from typing import Union
import gradio as gr
import yaml
from theflow.storage import storage
from theflow.utils.modules import import_dotted_string
from kotaemon.contribs.promptui.base import COMPONENTS_CLASS, SUPPORTED_COMPONENTS
from kotaemon.contribs.promptui.export import export
USAGE_INSTRUCTION = """In case of errors, you can:
- PromptUI instruction:
https://github.com/Cinnamon/kotaemon/wiki/Utilities#prompt-engineering-ui
- Create bug fix and make PR at: https://github.com/Cinnamon/kotaemon
- Ping any of @john @tadashi @ian @jacky in Slack channel #llm-productization"""
@@ -73,6 +80,8 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
outputs.append(component)
exported_file = gr.File(label="Output file", show_label=True)
temp = gr.Tab
with gr.Blocks(analytics_enabled=False, title="Welcome to PromptUI") as demo:
with gr.Accordion(label="Usage", open=False):
@@ -80,8 +89,10 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
with gr.Row():
run_btn = gr.Button("Run")
run_btn.click(func_run, inputs=inputs + params, outputs=outputs)
export_btn = gr.Button("Export")
export_btn.click(func_export, inputs=None, outputs=None)
export_btn = gr.Button(
"Export (Result will be in Exported file next to Output)"
)
export_btn.click(func_export, inputs=None, outputs=exported_file)
with gr.Row():
with gr.Column():
with temp("Inputs"):
@@ -91,8 +102,11 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
for component in params:
component.render()
with gr.Column():
for component in outputs:
component.render()
with temp("Outputs"):
for component in outputs:
component.render()
with temp("Exported file"):
exported_file.render()
return demo
@@ -103,6 +117,10 @@ def build_pipeline_ui(config: dict, pipeline_def):
params_name = list(config.get("params", {}).keys())
outputs_def = config.get("outputs", [])
output_dir: Path = Path(storage.url(pipeline_def().config.store_result))
exported_dir = output_dir.parent / "exported"
exported_dir.mkdir(parents=True, exist_ok=True)
def run_func(*args):
inputs = {
name: value for name, value in zip(inputs_name, args[: len(inputs_name)])
@@ -113,6 +131,13 @@ def build_pipeline_ui(config: dict, pipeline_def):
pipeline = pipeline_def()
pipeline.set(params)
pipeline(**inputs)
with storage.open(
storage.url(
pipeline.config.store_result, pipeline.last_run.id(), "params.pkl"
),
"wb",
) as f:
pickle.dump(params, f)
if outputs_def:
outputs = []
for output_def in outputs_def:
@@ -122,8 +147,20 @@ def build_pipeline_ui(config: dict, pipeline_def):
outputs.append(output)
return outputs
# TODO: export_func is None for now
return construct_ui(config, run_func, None)
def export_func():
    """Export this pipeline's logs to a timestamped Excel file.

    Returns:
        str: Path of the written ``.xlsx`` file inside ``exported_dir``.

    Raises:
        gr.Error: If the export fails; the original exception is chained as
            the cause.
    """
    # str(datetime.now()) contains spaces and ":" which are not valid in file
    # names on some platforms (e.g. Windows), so use an explicit safe format.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
    name = f"{pipeline_def.__module__}.{pipeline_def.__name__}_{timestamp}.xlsx"
    path = str(exported_dir / name)
    gr.Info(f"Begin exporting {name}...")
    try:
        export(config=config, pipeline_def=pipeline_def, output_path=path)
    except Exception as e:
        raise gr.Error(
            f"Failed to export. Please contact project's AIR: {e}"
        ) from e
    gr.Info(f"Exported {name}. Please go to the `Exported file` tab to download")
    return path
return construct_ui(config, run_func, export_func)
def build_from_dict(config: Union[str, dict]):
@@ -148,4 +185,6 @@ def build_from_dict(config: Union[str, dict]):
else:
demo = gr.TabbedInterface(demos, list(config_dict.keys()))
demo.queue()
return demo