Restructure index to allow it to be dynamically created by end-user (#151)

1. Introduce the concept of "collection_name" to docstore and vector store. Each collection can be viewed similarly to a table in a SQL database. It allows better organizing information within this data source.
2. Move the `Index` and `Source` tables from the application scope into the index scope. For each new index created by the user, a corresponding set of these tables is created, so they depend on the index rather than on the app.
3. Make each index responsible for the UI components in the app.
4. Construct the File UI page.
This commit is contained in:
Duc Nguyen (john)
2024-03-07 01:50:47 +07:00
committed by GitHub
parent cc87aaa783
commit 8a90fcfc99
43 changed files with 1658 additions and 812 deletions

View File

@@ -1,33 +1,44 @@
import asyncio
from copy import deepcopy
from typing import Optional
import gradio as gr
from ktem.app import BasePage
from ktem.components import reasonings
from ktem.db.models import Conversation, engine
from sqlmodel import Session, select
from .chat_panel import ChatPanel
from .control import ConversationControl
from .data_source import DataSource
from .events import (
chat_fn,
index_files_from_dir,
index_fn,
is_liked,
load_files,
update_data_source,
)
from .report import ReportIssue
from .upload import DirectoryUpload, FileUpload
class ChatPage(BasePage):
def __init__(self, app):
self._app = app
self._indices_input = []
self.on_building_ui()
def on_building_ui(self):
with gr.Row():
with gr.Column(scale=1):
self.chat_control = ConversationControl(self._app)
self.data_source = DataSource(self._app)
self.file_upload = FileUpload(self._app)
self.dir_upload = DirectoryUpload(self._app)
for index in self._app.index_manager.indices:
index.selector = -1
index_ui = index.get_selector_component_ui()
if not index_ui:
continue
index_ui.unrender()
with gr.Accordion(label=f"{index.name} Index", open=False):
index_ui.render()
gr_index = index_ui.as_gradio_component()
if gr_index:
index.selector = len(self._indices_input)
self._indices_input.append(gr_index)
setattr(self, f"_index_{index.id}", index_ui)
self.report_issue = ReportIssue(self._app)
with gr.Column(scale=6):
self.chat_panel = ChatPanel(self._app)
@@ -36,19 +47,23 @@ class ChatPage(BasePage):
self.info_panel = gr.HTML(elem_id="chat-info-panel")
def on_register_events(self):
self.chat_panel.submit_btn.click(
self.chat_panel.submit_msg,
gr.on(
triggers=[
self.chat_panel.text_input.submit,
self.chat_panel.submit_btn.click,
],
fn=self.chat_panel.submit_msg,
inputs=[self.chat_panel.text_input, self.chat_panel.chatbot],
outputs=[self.chat_panel.text_input, self.chat_panel.chatbot],
show_progress="hidden",
).then(
fn=chat_fn,
fn=self.chat_fn,
inputs=[
self.chat_control.conversation_id,
self.chat_panel.chatbot,
self.data_source.files,
self._app.settings_state,
],
]
+ self._indices_input,
outputs=[
self.chat_panel.text_input,
self.chat_panel.chatbot,
@@ -56,46 +71,17 @@ class ChatPage(BasePage):
],
show_progress="minimal",
).then(
fn=update_data_source,
inputs=[
self.chat_control.conversation_id,
self.data_source.files,
self.chat_panel.chatbot,
],
outputs=None,
)
self.chat_panel.text_input.submit(
self.chat_panel.submit_msg,
inputs=[self.chat_panel.text_input, self.chat_panel.chatbot],
outputs=[self.chat_panel.text_input, self.chat_panel.chatbot],
show_progress="hidden",
).then(
fn=chat_fn,
fn=self.update_data_source,
inputs=[
self.chat_control.conversation_id,
self.chat_panel.chatbot,
self.data_source.files,
self._app.settings_state,
],
outputs=[
self.chat_panel.text_input,
self.chat_panel.chatbot,
self.info_panel,
],
show_progress="minimal",
).then(
fn=update_data_source,
inputs=[
self.chat_control.conversation_id,
self.data_source.files,
self.chat_panel.chatbot,
],
]
+ self._indices_input,
outputs=None,
)
self.chat_panel.chatbot.like(
fn=is_liked,
fn=self.is_liked,
inputs=[self.chat_control.conversation_id],
outputs=None,
)
@@ -107,9 +93,9 @@ class ChatPage(BasePage):
self.chat_control.conversation_id,
self.chat_control.conversation,
self.chat_control.conversation_rn,
self.data_source.files,
self.chat_panel.chatbot,
],
]
+ self._indices_input,
show_progress="hidden",
)
@@ -121,47 +107,112 @@ class ChatPage(BasePage):
self.report_issue.more_detail,
self.chat_control.conversation_id,
self.chat_panel.chatbot,
self.data_source.files,
self._app.settings_state,
self._app.user_id,
],
]
+ self._indices_input,
outputs=None,
)
self.data_source.files.input(
fn=update_data_source,
inputs=[
self.chat_control.conversation_id,
self.data_source.files,
self.chat_panel.chatbot,
],
outputs=None,
)
def update_data_source(self, convo_id, messages, *selecteds):
"""Update the data source"""
if not convo_id:
gr.Warning("No conversation selected")
return
self.file_upload.upload_button.click(
fn=index_fn,
inputs=[
self.file_upload.files,
self.file_upload.reindex,
self.data_source.files,
self._app.settings_state,
],
outputs=[self.file_upload.file_output, self.data_source.files],
)
selecteds_ = {}
for index in self._app.index_manager.indices:
if index.selector != -1:
selecteds_[str(index.id)] = selecteds[index.selector]
self.dir_upload.upload_button.click(
fn=index_files_from_dir,
inputs=[
self.dir_upload.path,
self.dir_upload.reindex,
self.data_source.files,
self._app.settings_state,
],
outputs=[self.dir_upload.file_output, self.data_source.files],
)
with Session(engine) as session:
statement = select(Conversation).where(Conversation.id == convo_id)
result = session.exec(statement).one()
self._app.app.load(
lambda: gr.update(choices=load_files()),
inputs=None,
outputs=[self.data_source.files],
)
data_source = result.data_source
result.data_source = {
"selected": selecteds_,
"messages": messages,
"likes": deepcopy(data_source.get("likes", [])),
}
session.add(result)
session.commit()
def is_liked(self, convo_id, liked: gr.LikeData):
    """Record a like/dislike event on the conversation's data source."""
    with Session(engine) as session:
        query = select(Conversation).where(Conversation.id == convo_id)
        convo = session.exec(query).one()

        # Reassign a fresh dict rather than mutating in place, so the ORM
        # detects the change to the JSON column.
        updated = deepcopy(convo.data_source)
        updated.setdefault("likes", []).append(
            [liked.index, liked.value, liked.liked]
        )
        convo.data_source = updated

        session.add(convo)
        session.commit()
def create_pipeline(self, settings: dict, *selecteds):
    """Create the pipeline from settings

    Args:
        settings: the settings of the app
        selecteds: the per-index selections that will be served as context.
            Each index with a selector reads its own entry from this tuple.

    Returns:
        the pipeline objects
    """
    # Collect retriever pipelines from every registered index; an index with
    # selector == -1 exposes no selection UI and gets an empty selection.
    retrievers = []
    for index in self._app.index_manager.indices:
        selected = selecteds[index.selector] if index.selector != -1 else []
        retrievers.extend(index.get_retriever_pipelines(settings, selected))

    # The reasoning mode from settings decides which pipeline class to build.
    reasoning_cls = reasonings[settings["reasoning.use"]]
    return reasoning_cls.get_pipeline(settings, retrievers)
async def chat_fn(self, conversation_id, chat_history, settings, *selecteds):
    """Chat function

    Streams (text_input, chat_history, refs) updates while the reasoning
    pipeline pushes responses onto an asyncio queue.

    Args:
        conversation_id: id of the active conversation
        chat_history: list of (user, bot) message pairs; the last entry
            holds the new user input
        settings: the settings of the app
        selecteds: per-index selections forwarded to create_pipeline
    """
    chat_input = chat_history[-1][0]
    chat_history = chat_history[:-1]

    queue: asyncio.Queue[Optional[dict]] = asyncio.Queue()

    # construct the pipeline
    pipeline = self.create_pipeline(settings, *selecteds)
    pipeline.set_output_queue(queue)
    asyncio.create_task(pipeline(chat_input, conversation_id, chat_history))

    text, refs = "", ""
    len_ref = -1  # for logging purpose
    while True:
        try:
            response = queue.get_nowait()
        except asyncio.QueueEmpty:
            # Fix: catch only QueueEmpty (a bare Exception would silently
            # swallow real errors), and explicitly yield control to the
            # event loop so the producer task is not starved.
            await asyncio.sleep(0)
            yield "", chat_history + [(chat_input, text or "Thinking ...")], refs
            continue

        if response is None:
            queue.task_done()
            print("Chat completed")
            break

        if "output" in response:
            text += response["output"]

        if "evidence" in response:
            refs += response["evidence"]

        if len(refs) > len_ref:
            print(f"Len refs: {len(refs)}")
            len_ref = len(refs)

        yield "", chat_history + [(chat_input, text)], refs

View File

@@ -166,15 +166,23 @@ class ConversationControl(BasePage):
result = session.exec(statement).one()
id_ = result.id
name = result.name
files = result.data_source.get("files", [])
selected = result.data_source.get("selected", {})
chats = result.data_source.get("messages", [])
except Exception as e:
logger.warning(e)
id_ = ""
name = ""
files = []
selected = {}
chats = []
return id_, id_, name, files, chats
indices = []
for index in self._app.index_manager.indices:
# assume that the index has selector
if index.selector == -1:
continue
indices.append(selected.get(str(index.id), []))
return id_, id_, name, chats, *indices
def rename_conv(self, conversation_id, new_name, user_id):
"""Rename the conversation"""

View File

@@ -1,18 +0,0 @@
import gradio as gr
from ktem.app import BasePage
class DataSource(BasePage):
    """Accordion panel where the user picks the files that serve as context."""

    def __init__(self, app):
        self._app = app
        self.on_building_ui()

    def on_building_ui(self):
        """Render the multi-select file dropdown inside an open accordion."""
        with gr.Accordion(label="Data source", open=True):
            self.files = gr.Dropdown(
                label="Files",
                choices=[],
                interactive=True,
                multiselect=True,
                container=False,
            )

View File

@@ -1,305 +0,0 @@
import asyncio
import os
import tempfile
from copy import deepcopy
from typing import Optional, Type
import gradio as gr
from ktem.components import llms, reasonings
from ktem.db.models import Conversation, Source, engine
from ktem.indexing.base import BaseIndexing
from sqlmodel import Session, select
from theflow.settings import settings as app_settings
from theflow.utils.modules import import_dotted_string
def create_pipeline(settings: dict, files: Optional[list] = None):
    """Create the pipeline from settings

    Args:
        settings: the settings of the app
        files: the list of file ids that will be served as context. If None, then
            consider using all files

    Returns:
        the pipeline objects
    """
    # get retrievers
    indexing_cls: BaseIndexing = import_dotted_string(app_settings.KH_INDEX, safe=False)
    retrievers = indexing_cls.get_pipeline(settings).get_retrievers(
        settings, files=files
    )
    reasoning_mode = settings["reasoning.use"]
    reasoning_cls = reasonings[reasoning_mode]
    pipeline = reasoning_cls.get_pipeline(settings, retrievers, files=files)

    # Agent-based reasoning modes replace the plain pipeline above with an
    # agent wired to the tools selected in the settings.
    if settings["reasoning.use"] in ["rewoo", "react"]:
        # Imported lazily so the agents package is only loaded when needed.
        from kotaemon.agents import ReactAgent, RewooAgent

        llm = (
            llms["gpt4"]
            if settings["answer_simple_llm_model"] == "gpt-4"
            else llms["gpt35"]
        )

        tools = []
        tools_keys = (
            "answer_rewoo_tools"
            if settings["reasoning.use"] == "rewoo"
            else "answer_react_tools"
        )
        for tool in settings[tools_keys]:
            if tool == "llm":
                from kotaemon.agents import LLMTool

                tools.append(LLMTool(llm=llm))
            # elif tool == "docsearch":
            #     pass
            #     filenames = ""
            #     if files:
            #         with Session(engine) as session:
            #             statement = select(Source).where(
            #                 Source.id.in_(files)  # type: ignore
            #             )
            #             results = session.exec(statement).all()
            #             filenames = (
            #                 "The file names are: "
            #                 + " ".join([result.name for result in results])
            #                 + ". "
            #             )
            #     tool = ComponentTool(
            #         name="docsearch",
            #         description=(
            #             "A vector store that searches for similar and "
            #             "related content "
            #             f"in a document. {filenames}"
            #             "The result is a huge chunk of text related "
            #             "to your search but can also "
            #             "contain irrelevant info."
            #         ),
            #         component=retrieval_pipeline,
            #         postprocessor=lambda docs: "\n\n".join(
            #             [doc.text.replace("\n", " ") for doc in docs]
            #         ),
            #     )
            #     tools.append(tool)
            elif tool == "google":
                from kotaemon.agents import GoogleSearchTool

                tools.append(GoogleSearchTool())
            elif tool == "wikipedia":
                from kotaemon.agents import WikipediaTool

                tools.append(WikipediaTool())
            else:
                raise NotImplementedError(f"Unknown tool: {tool}")

        if settings["reasoning.use"] == "rewoo":
            pipeline = RewooAgent(
                planner_llm=llm,
                solver_llm=llm,
                plugins=tools,
            )
            pipeline.set_run({".use_citation": True})
        else:
            pipeline = ReactAgent(
                llm=llm,
                plugins=tools,
            )

    return pipeline
async def chat_fn(conversation_id, chat_history, files, settings):
    """Chat function

    Streams (text_input, chat_history, refs) updates while the reasoning
    pipeline pushes responses onto an asyncio queue.

    Args:
        conversation_id: id of the active conversation
        chat_history: list of (user, bot) message pairs; the last entry
            holds the new user input
        files: file ids to use as context (None means all files)
        settings: the settings of the app
    """
    chat_input = chat_history[-1][0]
    chat_history = chat_history[:-1]

    queue: asyncio.Queue[Optional[dict]] = asyncio.Queue()

    # construct the pipeline
    pipeline = create_pipeline(settings, files)
    pipeline.set_output_queue(queue)
    asyncio.create_task(pipeline(chat_input, conversation_id, chat_history))

    text, refs = "", ""
    len_ref = -1  # for logging purpose
    while True:
        try:
            response = queue.get_nowait()
        except asyncio.QueueEmpty:
            # Fix: catch only QueueEmpty (a bare Exception would silently
            # swallow real errors), and explicitly yield control to the
            # event loop so the producer task is not starved.
            await asyncio.sleep(0)
            yield "", chat_history + [(chat_input, text or "Thinking ...")], refs
            continue

        if response is None:
            queue.task_done()
            print("Chat completed")
            break

        if "output" in response:
            text += response["output"]

        if "evidence" in response:
            refs += response["evidence"]

        if len(refs) > len_ref:
            print(f"Len refs: {len(refs)}")
            len_ref = len(refs)

        yield "", chat_history + [(chat_input, text)], refs
def is_liked(convo_id, liked: gr.LikeData):
    """Record a like/dislike event on the conversation's data source."""
    with Session(engine) as session:
        query = select(Conversation).where(Conversation.id == convo_id)
        convo = session.exec(query).one()

        # Reassign a fresh dict rather than mutating in place, so the ORM
        # detects the change to the JSON column.
        updated = deepcopy(convo.data_source)
        updated.setdefault("likes", []).append(
            [liked.index, liked.value, liked.liked]
        )
        convo.data_source = updated

        session.add(convo)
        session.commit()
def update_data_source(convo_id, selected_files, messages):
    """Update the data source"""
    if not convo_id:
        gr.Warning("No conversation selected")
        return

    with Session(engine) as session:
        query = select(Conversation).where(Conversation.id == convo_id)
        convo = session.exec(query).one()

        # Replace files and messages wholesale; carry the existing likes over.
        previous = convo.data_source
        convo.data_source = {
            "files": selected_files,
            "messages": messages,
            "likes": deepcopy(previous.get("likes", [])),
        }

        session.add(convo)
        session.commit()
def load_files():
    """Return (name, id) option pairs for every indexed source."""
    with Session(engine) as session:
        sources = session.exec(select(Source)).all()
        return [(source.name, source.id) for source in sources]
def index_fn(files, reindex: bool, selected_files, settings):
    """Upload and index the files

    Args:
        files: the list of files to be uploaded
        reindex: whether to reindex the files
        selected_files: the list of files already selected
        settings: the settings of the app

    Returns:
        a pair of gradio updates: the debug text file download, and the
        file dropdown refreshed with the newly indexed ids
    """
    gr.Info(f"Start indexing {len(files)} files...")

    # get the pipeline
    indexing_cls: Type[BaseIndexing] = import_dotted_string(
        app_settings.KH_INDEX, safe=False
    )
    indexing_pipeline = indexing_cls.get_pipeline(settings)
    output_nodes, file_ids = indexing_pipeline(files, reindex=reindex)
    gr.Info(f"Finish indexing into {len(output_nodes)} chunks")

    # download the file
    # NOTE(review): the chunk text is dumped to a temp file for debugging;
    # mkstemp's fd is closed after writing via a separate handle.
    text = "\n\n".join([each.text for each in output_nodes])
    handler, file_path = tempfile.mkstemp(suffix=".txt")
    with open(file_path, "w") as f:
        f.write(text)
    os.close(handler)

    # Append the new ids to any prior selection; a non-list selection
    # (e.g. None) is replaced outright.
    if isinstance(selected_files, list):
        output = selected_files + file_ids
    else:
        output = file_ids

    file_list = load_files()
    return (
        gr.update(value=file_path, visible=True),
        gr.update(value=output, choices=file_list),  # unnecessary
    )
def index_files_from_dir(folder_path, reindex, selected_files, settings):
    """This should be constructable by users

    It means that the users can build their own index.

    Build your own index:
        - Input:
            - Type: based on the type, then there are ranges of. User can select
              multiple panels:
                - Panels
                - Data sources
                - Include patterns
                - Exclude patterns
        - Indexing functions. Can be a list of indexing functions. Each declared
          function is:
            - Condition (the source that will go through this indexing function)
            - Function (the pipeline that run this)
        - Output: artifacts that can be used to -> this is the artifacts that we wish
        - Build the UI
            - Upload page: fixed standard, based on the type
            - Read page: fixed standard, based on the type
            - Delete page: fixed standard, based on the type
        - Build the index function
        - Build the chat function

    Step:
        1. Decide on the artifacts
        2. Implement the transformation from artifacts to UI
    """
    if not folder_path:
        return

    import fnmatch
    from pathlib import Path

    # Patterns are currently hard-coded; includes are empty so only the
    # exclude list has effect.
    include_patterns: list[str] = []
    exclude_patterns: list[str] = ["*.png", "*.gif", "*/.*"]
    if include_patterns and exclude_patterns:
        raise ValueError("Cannot have both include and exclude patterns")

    # clean up the include patterns
    # NOTE(review): patterns are anchored to Path.cwd(), not folder_path —
    # confirm this is intended when the app runs from a different directory.
    for idx in range(len(include_patterns)):
        if include_patterns[idx].startswith("*"):
            include_patterns[idx] = str(Path.cwd() / "**" / include_patterns[idx])
        else:
            include_patterns[idx] = str(Path.cwd() / include_patterns[idx].strip("/"))

    # clean up the exclude patterns
    for idx in range(len(exclude_patterns)):
        if exclude_patterns[idx].startswith("*"):
            exclude_patterns[idx] = str(Path.cwd() / "**" / exclude_patterns[idx])
        else:
            exclude_patterns[idx] = str(Path.cwd() / exclude_patterns[idx].strip("/"))

    # get the files
    # NOTE(review): the "**/*.*" glob skips files without an extension.
    files: list[str] = [str(p) for p in Path(folder_path).glob("**/*.*")]
    if include_patterns:
        for p in include_patterns:
            files = fnmatch.filter(names=files, pat=p)

    if exclude_patterns:
        for p in exclude_patterns:
            files = [f for f in files if not fnmatch.fnmatch(name=f, pat=p)]

    return index_fn(files, reindex, selected_files, settings)

View File

@@ -46,10 +46,15 @@ class ReportIssue(BasePage):
more_detail: str,
conv_id: str,
chat_history: list,
files: list,
settings: dict,
user_id: Optional[int],
*selecteds
):
selecteds_ = {}
for index in self._app.index_manager.indices:
if index.selector != -1:
selecteds_[str(index.id)] = selecteds[index.selector]
with Session(engine) as session:
issue = IssueReport(
issues={
@@ -60,7 +65,7 @@ class ReportIssue(BasePage):
chat={
"conv_id": conv_id,
"chat_history": chat_history,
"files": files,
"selecteds": selecteds_,
},
settings=settings,
user=user_id,

View File

@@ -1,79 +0,0 @@
import gradio as gr
from ktem.app import BasePage
class FileUpload(BasePage):
    """Accordion panel for uploading and indexing individual files."""

    def __init__(self, app):
        self._app = app
        # Extensions accepted by the picker; "image" is gradio's shorthand
        # for all image types.
        self._supported_file_types = [
            "image", ".pdf", ".txt", ".csv", ".xlsx",
            ".doc", ".docx", ".pptx", ".html", ".zip",
        ]
        self.on_building_ui()

    def on_building_ui(self):
        """Render the upload widget, indexing options and debug output."""
        with gr.Accordion(label="File upload", open=False):
            supported = ", ".join(self._supported_file_types)
            gr.Markdown(f"Supported file types: {supported}")
            self.files = gr.File(
                file_types=self._supported_file_types,
                file_count="multiple",
                container=False,
                height=50,
            )
            with gr.Accordion("Advanced indexing options", open=False):
                with gr.Row():
                    self.reindex = gr.Checkbox(
                        value=False, label="Force reindex file", container=False
                    )

            self.upload_button = gr.Button("Upload and Index")
            self.file_output = gr.File(
                visible=False, label="Output files (debug purpose)"
            )
class DirectoryUpload(BasePage):
    """Accordion panel for indexing every file under a directory path."""

    def __init__(self, app):
        self._app = app
        # Extensions accepted for indexing; "image" is gradio's shorthand
        # for all image types.
        self._supported_file_types = [
            "image", ".pdf", ".txt", ".csv", ".xlsx",
            ".doc", ".docx", ".pptx", ".html", ".zip",
        ]
        self.on_building_ui()

    def on_building_ui(self):
        """Render the path textbox, indexing options and debug output."""
        with gr.Accordion(label="Directory upload", open=False):
            supported = ", ".join(self._supported_file_types)
            gr.Markdown(f"Supported file types: {supported}")
            self.path = gr.Textbox(
                placeholder="Directory path...", lines=1, max_lines=1, container=False
            )
            with gr.Accordion("Advanced indexing options", open=False):
                with gr.Row():
                    self.reindex = gr.Checkbox(
                        value=False, label="Force reindex file", container=False
                    )

            self.upload_button = gr.Button("Upload and Index")
            self.file_output = gr.File(
                visible=False, label="Output files (debug purpose)"
            )

View File

@@ -329,9 +329,17 @@ class SettingsPage(BasePage):
self._components[f"application.{n}"] = obj
def index_tab(self):
for n, si in self._default_settings.index.settings.items():
obj = render_setting_item(si, si.value)
self._components[f"index.{n}"] = obj
# TODO: double check if we need general
# with gr.Tab("General"):
# for n, si in self._default_settings.index.settings.items():
# obj = render_setting_item(si, si.value)
# self._components[f"index.{n}"] = obj
for pn, sig in self._default_settings.index.options.items():
with gr.Tab(f"Index {pn}"):
for n, si in sig.settings.items():
obj = render_setting_item(si, si.value)
self._components[f"index.options.{pn}.{n}"] = obj
def reasoning_tab(self):
with gr.Group():