Restructure index to allow it to be dynamically created by end-user (#151)

1. Introduce the concept of "collection_name" to docstore and vector store. Each collection can be viewed similarly to a table in a SQL database. It allows better organizing information within this data source.
2. Move the `Index` and `Source` tables from the application scope into the index scope. For each new index created by the user, a corresponding set of these tables is created, so they depend on the index rather than on the app.
3. Make each index responsible for the UI components in the app.
4. Construct the File UI page.
This commit is contained in:
Duc Nguyen (john)
2024-03-07 01:50:47 +07:00
committed by GitHub
parent cc87aaa783
commit 8a90fcfc99
43 changed files with 1658 additions and 812 deletions

View File

@@ -1,33 +1,44 @@
import asyncio
from copy import deepcopy
from typing import Optional
import gradio as gr
from ktem.app import BasePage
from ktem.components import reasonings
from ktem.db.models import Conversation, engine
from sqlmodel import Session, select
from .chat_panel import ChatPanel
from .control import ConversationControl
from .data_source import DataSource
from .events import (
chat_fn,
index_files_from_dir,
index_fn,
is_liked,
load_files,
update_data_source,
)
from .report import ReportIssue
from .upload import DirectoryUpload, FileUpload
class ChatPage(BasePage):
def __init__(self, app):
self._app = app
self._indices_input = []
self.on_building_ui()
def on_building_ui(self):
with gr.Row():
with gr.Column(scale=1):
self.chat_control = ConversationControl(self._app)
self.data_source = DataSource(self._app)
self.file_upload = FileUpload(self._app)
self.dir_upload = DirectoryUpload(self._app)
for index in self._app.index_manager.indices:
index.selector = -1
index_ui = index.get_selector_component_ui()
if not index_ui:
continue
index_ui.unrender()
with gr.Accordion(label=f"{index.name} Index", open=False):
index_ui.render()
gr_index = index_ui.as_gradio_component()
if gr_index:
index.selector = len(self._indices_input)
self._indices_input.append(gr_index)
setattr(self, f"_index_{index.id}", index_ui)
self.report_issue = ReportIssue(self._app)
with gr.Column(scale=6):
self.chat_panel = ChatPanel(self._app)
@@ -36,19 +47,23 @@ class ChatPage(BasePage):
self.info_panel = gr.HTML(elem_id="chat-info-panel")
def on_register_events(self):
self.chat_panel.submit_btn.click(
self.chat_panel.submit_msg,
gr.on(
triggers=[
self.chat_panel.text_input.submit,
self.chat_panel.submit_btn.click,
],
fn=self.chat_panel.submit_msg,
inputs=[self.chat_panel.text_input, self.chat_panel.chatbot],
outputs=[self.chat_panel.text_input, self.chat_panel.chatbot],
show_progress="hidden",
).then(
fn=chat_fn,
fn=self.chat_fn,
inputs=[
self.chat_control.conversation_id,
self.chat_panel.chatbot,
self.data_source.files,
self._app.settings_state,
],
]
+ self._indices_input,
outputs=[
self.chat_panel.text_input,
self.chat_panel.chatbot,
@@ -56,46 +71,17 @@ class ChatPage(BasePage):
],
show_progress="minimal",
).then(
fn=update_data_source,
inputs=[
self.chat_control.conversation_id,
self.data_source.files,
self.chat_panel.chatbot,
],
outputs=None,
)
self.chat_panel.text_input.submit(
self.chat_panel.submit_msg,
inputs=[self.chat_panel.text_input, self.chat_panel.chatbot],
outputs=[self.chat_panel.text_input, self.chat_panel.chatbot],
show_progress="hidden",
).then(
fn=chat_fn,
fn=self.update_data_source,
inputs=[
self.chat_control.conversation_id,
self.chat_panel.chatbot,
self.data_source.files,
self._app.settings_state,
],
outputs=[
self.chat_panel.text_input,
self.chat_panel.chatbot,
self.info_panel,
],
show_progress="minimal",
).then(
fn=update_data_source,
inputs=[
self.chat_control.conversation_id,
self.data_source.files,
self.chat_panel.chatbot,
],
]
+ self._indices_input,
outputs=None,
)
self.chat_panel.chatbot.like(
fn=is_liked,
fn=self.is_liked,
inputs=[self.chat_control.conversation_id],
outputs=None,
)
@@ -107,9 +93,9 @@ class ChatPage(BasePage):
self.chat_control.conversation_id,
self.chat_control.conversation,
self.chat_control.conversation_rn,
self.data_source.files,
self.chat_panel.chatbot,
],
]
+ self._indices_input,
show_progress="hidden",
)
@@ -121,47 +107,112 @@ class ChatPage(BasePage):
self.report_issue.more_detail,
self.chat_control.conversation_id,
self.chat_panel.chatbot,
self.data_source.files,
self._app.settings_state,
self._app.user_id,
],
]
+ self._indices_input,
outputs=None,
)
self.data_source.files.input(
fn=update_data_source,
inputs=[
self.chat_control.conversation_id,
self.data_source.files,
self.chat_panel.chatbot,
],
outputs=None,
)
def update_data_source(self, convo_id, messages, *selecteds):
"""Update the data source"""
if not convo_id:
gr.Warning("No conversation selected")
return
self.file_upload.upload_button.click(
fn=index_fn,
inputs=[
self.file_upload.files,
self.file_upload.reindex,
self.data_source.files,
self._app.settings_state,
],
outputs=[self.file_upload.file_output, self.data_source.files],
)
selecteds_ = {}
for index in self._app.index_manager.indices:
if index.selector != -1:
selecteds_[str(index.id)] = selecteds[index.selector]
self.dir_upload.upload_button.click(
fn=index_files_from_dir,
inputs=[
self.dir_upload.path,
self.dir_upload.reindex,
self.data_source.files,
self._app.settings_state,
],
outputs=[self.dir_upload.file_output, self.data_source.files],
)
with Session(engine) as session:
statement = select(Conversation).where(Conversation.id == convo_id)
result = session.exec(statement).one()
self._app.app.load(
lambda: gr.update(choices=load_files()),
inputs=None,
outputs=[self.data_source.files],
)
data_source = result.data_source
result.data_source = {
"selected": selecteds_,
"messages": messages,
"likes": deepcopy(data_source.get("likes", [])),
}
session.add(result)
session.commit()
def is_liked(self, convo_id, liked: gr.LikeData):
    """Record a like/dislike event on the conversation's data source."""
    with Session(engine) as session:
        query = select(Conversation).where(Conversation.id == convo_id)
        convo = session.exec(query).one()

        # Reassign a fresh dict rather than mutating in place, so the ORM
        # detects the change to the JSON column.
        updated = deepcopy(convo.data_source)
        updated.setdefault("likes", []).append(
            [liked.index, liked.value, liked.liked]
        )
        convo.data_source = updated

        session.add(convo)
        session.commit()
def create_pipeline(self, settings: dict, *selecteds):
    """Create the pipeline from settings

    Args:
        settings: the settings of the app
        selecteds: the per-index selections that will be served as context.
            Each index with a selector reads its own entry from this tuple.

    Returns:
        the pipeline objects
    """
    # Collect retriever pipelines from every registered index; an index with
    # selector == -1 exposes no selection UI and gets an empty selection.
    retrievers = []
    for index in self._app.index_manager.indices:
        selected = selecteds[index.selector] if index.selector != -1 else []
        retrievers.extend(index.get_retriever_pipelines(settings, selected))

    # The reasoning mode from settings decides which pipeline class to build.
    reasoning_cls = reasonings[settings["reasoning.use"]]
    return reasoning_cls.get_pipeline(settings, retrievers)
async def chat_fn(self, conversation_id, chat_history, settings, *selecteds):
    """Chat function

    Streams (text_input, chat_history, refs) updates while the reasoning
    pipeline pushes responses onto an asyncio queue.

    Args:
        conversation_id: id of the active conversation
        chat_history: list of (user, bot) message pairs; the last entry
            holds the new user input
        settings: the settings of the app
        selecteds: per-index selections forwarded to create_pipeline
    """
    chat_input = chat_history[-1][0]
    chat_history = chat_history[:-1]

    queue: asyncio.Queue[Optional[dict]] = asyncio.Queue()

    # construct the pipeline
    pipeline = self.create_pipeline(settings, *selecteds)
    pipeline.set_output_queue(queue)
    asyncio.create_task(pipeline(chat_input, conversation_id, chat_history))

    text, refs = "", ""
    len_ref = -1  # for logging purpose
    while True:
        try:
            response = queue.get_nowait()
        except asyncio.QueueEmpty:
            # Fix: catch only QueueEmpty (a bare Exception would silently
            # swallow real errors), and explicitly yield control to the
            # event loop so the producer task is not starved.
            await asyncio.sleep(0)
            yield "", chat_history + [(chat_input, text or "Thinking ...")], refs
            continue

        if response is None:
            queue.task_done()
            print("Chat completed")
            break

        if "output" in response:
            text += response["output"]

        if "evidence" in response:
            refs += response["evidence"]

        if len(refs) > len_ref:
            print(f"Len refs: {len(refs)}")
            len_ref = len(refs)

        yield "", chat_history + [(chat_input, text)], refs

View File

@@ -166,15 +166,23 @@ class ConversationControl(BasePage):
result = session.exec(statement).one()
id_ = result.id
name = result.name
files = result.data_source.get("files", [])
selected = result.data_source.get("selected", {})
chats = result.data_source.get("messages", [])
except Exception as e:
logger.warning(e)
id_ = ""
name = ""
files = []
selected = {}
chats = []
return id_, id_, name, files, chats
indices = []
for index in self._app.index_manager.indices:
# assume that the index has selector
if index.selector == -1:
continue
indices.append(selected.get(str(index.id), []))
return id_, id_, name, chats, *indices
def rename_conv(self, conversation_id, new_name, user_id):
"""Rename the conversation"""

View File

@@ -1,18 +0,0 @@
import gradio as gr
from ktem.app import BasePage
class DataSource(BasePage):
    """Accordion panel where the user picks the files that serve as context."""

    def __init__(self, app):
        self._app = app
        self.on_building_ui()

    def on_building_ui(self):
        """Render the multi-select file dropdown inside an open accordion."""
        with gr.Accordion(label="Data source", open=True):
            self.files = gr.Dropdown(
                label="Files",
                choices=[],
                interactive=True,
                multiselect=True,
                container=False,
            )

View File

@@ -1,305 +0,0 @@
import asyncio
import os
import tempfile
from copy import deepcopy
from typing import Optional, Type
import gradio as gr
from ktem.components import llms, reasonings
from ktem.db.models import Conversation, Source, engine
from ktem.indexing.base import BaseIndexing
from sqlmodel import Session, select
from theflow.settings import settings as app_settings
from theflow.utils.modules import import_dotted_string
def create_pipeline(settings: dict, files: Optional[list] = None):
    """Create the pipeline from settings

    Args:
        settings: the settings of the app
        files: the list of file ids that will be served as context. If None, then
            consider using all files

    Returns:
        the pipeline objects
    """
    # get retrievers
    indexing_cls: BaseIndexing = import_dotted_string(app_settings.KH_INDEX, safe=False)
    retrievers = indexing_cls.get_pipeline(settings).get_retrievers(
        settings, files=files
    )
    reasoning_mode = settings["reasoning.use"]
    reasoning_cls = reasonings[reasoning_mode]
    pipeline = reasoning_cls.get_pipeline(settings, retrievers, files=files)

    # Agent-based reasoning modes replace the plain pipeline above with an
    # agent wired to the tools selected in the settings.
    if settings["reasoning.use"] in ["rewoo", "react"]:
        # Imported lazily so the agents package is only loaded when needed.
        from kotaemon.agents import ReactAgent, RewooAgent

        llm = (
            llms["gpt4"]
            if settings["answer_simple_llm_model"] == "gpt-4"
            else llms["gpt35"]
        )

        tools = []
        tools_keys = (
            "answer_rewoo_tools"
            if settings["reasoning.use"] == "rewoo"
            else "answer_react_tools"
        )
        for tool in settings[tools_keys]:
            if tool == "llm":
                from kotaemon.agents import LLMTool

                tools.append(LLMTool(llm=llm))
            # elif tool == "docsearch":
            #     pass
            #     filenames = ""
            #     if files:
            #         with Session(engine) as session:
            #             statement = select(Source).where(
            #                 Source.id.in_(files)  # type: ignore
            #             )
            #             results = session.exec(statement).all()
            #             filenames = (
            #                 "The file names are: "
            #                 + " ".join([result.name for result in results])
            #                 + ". "
            #             )
            #     tool = ComponentTool(
            #         name="docsearch",
            #         description=(
            #             "A vector store that searches for similar and "
            #             "related content "
            #             f"in a document. {filenames}"
            #             "The result is a huge chunk of text related "
            #             "to your search but can also "
            #             "contain irrelevant info."
            #         ),
            #         component=retrieval_pipeline,
            #         postprocessor=lambda docs: "\n\n".join(
            #             [doc.text.replace("\n", " ") for doc in docs]
            #         ),
            #     )
            #     tools.append(tool)
            elif tool == "google":
                from kotaemon.agents import GoogleSearchTool

                tools.append(GoogleSearchTool())
            elif tool == "wikipedia":
                from kotaemon.agents import WikipediaTool

                tools.append(WikipediaTool())
            else:
                raise NotImplementedError(f"Unknown tool: {tool}")

        if settings["reasoning.use"] == "rewoo":
            pipeline = RewooAgent(
                planner_llm=llm,
                solver_llm=llm,
                plugins=tools,
            )
            pipeline.set_run({".use_citation": True})
        else:
            pipeline = ReactAgent(
                llm=llm,
                plugins=tools,
            )

    return pipeline
async def chat_fn(conversation_id, chat_history, files, settings):
    """Chat function

    Streams (text_input, chat_history, refs) updates while the reasoning
    pipeline pushes responses onto an asyncio queue.

    Args:
        conversation_id: id of the active conversation
        chat_history: list of (user, bot) message pairs; the last entry
            holds the new user input
        files: file ids to use as context (None means all files)
        settings: the settings of the app
    """
    chat_input = chat_history[-1][0]
    chat_history = chat_history[:-1]

    queue: asyncio.Queue[Optional[dict]] = asyncio.Queue()

    # construct the pipeline
    pipeline = create_pipeline(settings, files)
    pipeline.set_output_queue(queue)
    asyncio.create_task(pipeline(chat_input, conversation_id, chat_history))

    text, refs = "", ""
    len_ref = -1  # for logging purpose
    while True:
        try:
            response = queue.get_nowait()
        except asyncio.QueueEmpty:
            # Fix: catch only QueueEmpty (a bare Exception would silently
            # swallow real errors), and explicitly yield control to the
            # event loop so the producer task is not starved.
            await asyncio.sleep(0)
            yield "", chat_history + [(chat_input, text or "Thinking ...")], refs
            continue

        if response is None:
            queue.task_done()
            print("Chat completed")
            break

        if "output" in response:
            text += response["output"]

        if "evidence" in response:
            refs += response["evidence"]

        if len(refs) > len_ref:
            print(f"Len refs: {len(refs)}")
            len_ref = len(refs)

        yield "", chat_history + [(chat_input, text)], refs
def is_liked(convo_id, liked: gr.LikeData):
    """Record a like/dislike event on the conversation's data source."""
    with Session(engine) as session:
        query = select(Conversation).where(Conversation.id == convo_id)
        convo = session.exec(query).one()

        # Reassign a fresh dict rather than mutating in place, so the ORM
        # detects the change to the JSON column.
        updated = deepcopy(convo.data_source)
        updated.setdefault("likes", []).append(
            [liked.index, liked.value, liked.liked]
        )
        convo.data_source = updated

        session.add(convo)
        session.commit()
def update_data_source(convo_id, selected_files, messages):
    """Update the data source"""
    if not convo_id:
        gr.Warning("No conversation selected")
        return

    with Session(engine) as session:
        query = select(Conversation).where(Conversation.id == convo_id)
        convo = session.exec(query).one()

        # Replace files and messages wholesale; carry the existing likes over.
        previous = convo.data_source
        convo.data_source = {
            "files": selected_files,
            "messages": messages,
            "likes": deepcopy(previous.get("likes", [])),
        }

        session.add(convo)
        session.commit()
def load_files():
    """Return (name, id) option pairs for every indexed source."""
    with Session(engine) as session:
        sources = session.exec(select(Source)).all()
        return [(source.name, source.id) for source in sources]
def index_fn(files, reindex: bool, selected_files, settings):
    """Upload and index the files

    Args:
        files: the list of files to be uploaded
        reindex: whether to reindex the files
        selected_files: the list of files already selected
        settings: the settings of the app

    Returns:
        a pair of gradio updates: the debug text file download, and the
        file dropdown refreshed with the newly indexed ids
    """
    gr.Info(f"Start indexing {len(files)} files...")

    # get the pipeline
    indexing_cls: Type[BaseIndexing] = import_dotted_string(
        app_settings.KH_INDEX, safe=False
    )
    indexing_pipeline = indexing_cls.get_pipeline(settings)
    output_nodes, file_ids = indexing_pipeline(files, reindex=reindex)
    gr.Info(f"Finish indexing into {len(output_nodes)} chunks")

    # download the file
    # NOTE(review): the chunk text is dumped to a temp file for debugging;
    # mkstemp's fd is closed after writing via a separate handle.
    text = "\n\n".join([each.text for each in output_nodes])
    handler, file_path = tempfile.mkstemp(suffix=".txt")
    with open(file_path, "w") as f:
        f.write(text)
    os.close(handler)

    # Append the new ids to any prior selection; a non-list selection
    # (e.g. None) is replaced outright.
    if isinstance(selected_files, list):
        output = selected_files + file_ids
    else:
        output = file_ids

    file_list = load_files()
    return (
        gr.update(value=file_path, visible=True),
        gr.update(value=output, choices=file_list),  # unnecessary
    )
def index_files_from_dir(folder_path, reindex, selected_files, settings):
    """This should be constructable by users

    It means that the users can build their own index.

    Build your own index:
        - Input:
            - Type: based on the type, then there are ranges of. User can select
              multiple panels:
                - Panels
                - Data sources
                - Include patterns
                - Exclude patterns
        - Indexing functions. Can be a list of indexing functions. Each declared
          function is:
            - Condition (the source that will go through this indexing function)
            - Function (the pipeline that run this)
        - Output: artifacts that can be used to -> this is the artifacts that we wish
        - Build the UI
            - Upload page: fixed standard, based on the type
            - Read page: fixed standard, based on the type
            - Delete page: fixed standard, based on the type
        - Build the index function
        - Build the chat function

    Step:
        1. Decide on the artifacts
        2. Implement the transformation from artifacts to UI
    """
    if not folder_path:
        return

    import fnmatch
    from pathlib import Path

    # Patterns are currently hard-coded; includes are empty so only the
    # exclude list has effect.
    include_patterns: list[str] = []
    exclude_patterns: list[str] = ["*.png", "*.gif", "*/.*"]
    if include_patterns and exclude_patterns:
        raise ValueError("Cannot have both include and exclude patterns")

    # clean up the include patterns
    # NOTE(review): patterns are anchored to Path.cwd(), not folder_path —
    # confirm this is intended when the app runs from a different directory.
    for idx in range(len(include_patterns)):
        if include_patterns[idx].startswith("*"):
            include_patterns[idx] = str(Path.cwd() / "**" / include_patterns[idx])
        else:
            include_patterns[idx] = str(Path.cwd() / include_patterns[idx].strip("/"))

    # clean up the exclude patterns
    for idx in range(len(exclude_patterns)):
        if exclude_patterns[idx].startswith("*"):
            exclude_patterns[idx] = str(Path.cwd() / "**" / exclude_patterns[idx])
        else:
            exclude_patterns[idx] = str(Path.cwd() / exclude_patterns[idx].strip("/"))

    # get the files
    # NOTE(review): the "**/*.*" glob skips files without an extension.
    files: list[str] = [str(p) for p in Path(folder_path).glob("**/*.*")]
    if include_patterns:
        for p in include_patterns:
            files = fnmatch.filter(names=files, pat=p)

    if exclude_patterns:
        for p in exclude_patterns:
            files = [f for f in files if not fnmatch.fnmatch(name=f, pat=p)]

    return index_fn(files, reindex, selected_files, settings)

View File

@@ -46,10 +46,15 @@ class ReportIssue(BasePage):
more_detail: str,
conv_id: str,
chat_history: list,
files: list,
settings: dict,
user_id: Optional[int],
*selecteds
):
selecteds_ = {}
for index in self._app.index_manager.indices:
if index.selector != -1:
selecteds_[str(index.id)] = selecteds[index.selector]
with Session(engine) as session:
issue = IssueReport(
issues={
@@ -60,7 +65,7 @@ class ReportIssue(BasePage):
chat={
"conv_id": conv_id,
"chat_history": chat_history,
"files": files,
"selecteds": selecteds_,
},
settings=settings,
user=user_id,

View File

@@ -1,79 +0,0 @@
import gradio as gr
from ktem.app import BasePage
class FileUpload(BasePage):
    """Accordion panel for uploading and indexing individual files."""

    def __init__(self, app):
        self._app = app
        # Extensions accepted by the picker; "image" is gradio's shorthand
        # for all image types.
        self._supported_file_types = [
            "image", ".pdf", ".txt", ".csv", ".xlsx",
            ".doc", ".docx", ".pptx", ".html", ".zip",
        ]
        self.on_building_ui()

    def on_building_ui(self):
        """Render the upload widget, indexing options and debug output."""
        with gr.Accordion(label="File upload", open=False):
            supported = ", ".join(self._supported_file_types)
            gr.Markdown(f"Supported file types: {supported}")
            self.files = gr.File(
                file_types=self._supported_file_types,
                file_count="multiple",
                container=False,
                height=50,
            )
            with gr.Accordion("Advanced indexing options", open=False):
                with gr.Row():
                    self.reindex = gr.Checkbox(
                        value=False, label="Force reindex file", container=False
                    )

            self.upload_button = gr.Button("Upload and Index")
            self.file_output = gr.File(
                visible=False, label="Output files (debug purpose)"
            )
class DirectoryUpload(BasePage):
    """Accordion panel for indexing every file under a directory path."""

    def __init__(self, app):
        self._app = app
        # Extensions accepted for indexing; "image" is gradio's shorthand
        # for all image types.
        self._supported_file_types = [
            "image", ".pdf", ".txt", ".csv", ".xlsx",
            ".doc", ".docx", ".pptx", ".html", ".zip",
        ]
        self.on_building_ui()

    def on_building_ui(self):
        """Render the path textbox, indexing options and debug output."""
        with gr.Accordion(label="Directory upload", open=False):
            supported = ", ".join(self._supported_file_types)
            gr.Markdown(f"Supported file types: {supported}")
            self.path = gr.Textbox(
                placeholder="Directory path...", lines=1, max_lines=1, container=False
            )
            with gr.Accordion("Advanced indexing options", open=False):
                with gr.Row():
                    self.reindex = gr.Checkbox(
                        value=False, label="Force reindex file", container=False
                    )

            self.upload_button = gr.Button("Upload and Index")
            self.file_output = gr.File(
                visible=False, label="Output files (debug purpose)"
            )

View File

@@ -329,9 +329,17 @@ class SettingsPage(BasePage):
self._components[f"application.{n}"] = obj
def index_tab(self):
for n, si in self._default_settings.index.settings.items():
obj = render_setting_item(si, si.value)
self._components[f"index.{n}"] = obj
# TODO: double check if we need general
# with gr.Tab("General"):
# for n, si in self._default_settings.index.settings.items():
# obj = render_setting_item(si, si.value)
# self._components[f"index.{n}"] = obj
for pn, sig in self._default_settings.index.options.items():
with gr.Tab(f"Index {pn}"):
for n, si in sig.settings.items():
obj = render_setting_item(si, si.value)
self._components[f"index.options.{pn}.{n}"] = obj
def reasoning_tab(self):
with gr.Group():