Migrate the MVP into kotaemon (#108)

- Migrate the MVP into kotaemon. - Preliminarily include the pipeline within the chatbot interface. - Organize the MVP as an application. Todo: - Add an info panel to view the planning of agents -> Fix streaming agents' output. Resolve: #60 Resolve: #61 Resolve: #62
2024-01-10 15:28:09 +07:00
parent 230328c62f
commit 5a9d6f75be
31 changed files with 273 additions and 92 deletions
--- a/knowledgehub/loaders/ocr_loader.py
+++ b/knowledgehub/loaders/ocr_loader.py
@@ -26,11 +26,7 @@ class OCRReader(BaseReader):
        self.ocr_endpoint = endpoint
        self.use_ocr = use_ocr

-    def load_data(
-        self,
-        file_path: Path,
-        **kwargs,
-    ) -> List[Document]:
+    def load_data(self, file_path: Path, **kwargs) -> List[Document]:
        """Load data using OCR reader

        Args:
@@ -41,23 +37,24 @@ class OCRReader(BaseReader):
        Returns:
            List[Document]: list of documents extracted from the PDF file
        """
-        # create input params for the requests
-        content = open(file_path, "rb")
-        files = {"input": content}
-        data = {"job_id": uuid4(), "table_only": not self.use_ocr}
+        file_path = Path(file_path).resolve()
+
+        with file_path.open("rb") as content:
+            files = {"input": content}
+            data = {"job_id": uuid4(), "table_only": not self.use_ocr}
+
+            # call the API from FullOCR endpoint
+            if "response_content" in kwargs:
+                # overriding response content if specified
+                ocr_results = kwargs["response_content"]
+            else:
+                # call original API
+                resp = requests.post(url=self.ocr_endpoint, files=files, data=data)
+                ocr_results = resp.json()["result"]

        debug_path = kwargs.pop("debug_path", None)
        artifact_path = kwargs.pop("artifact_path", None)

-        # call the API from FullOCR endpoint
-        if "response_content" in kwargs:
-            # overriding response content if specified
-            ocr_results = kwargs["response_content"]
-        else:
-            # call original API
-            resp = requests.post(url=self.ocr_endpoint, files=files, data=data)
-            ocr_results = resp.json()["result"]
-
        # read PDF through normal reader (unstructured)
        pdf_page_items = read_pdf_unstructured(file_path)
        # merge PDF text output with OCR output
@@ -77,6 +74,9 @@ class OCRReader(BaseReader):
                    "type": "table",
                    "page_label": page_id + 1,
                    "source": file_path.name,
+                    "file_path": str(file_path),
+                    "file_name": file_path.name,
+                    "filename": str(file_path),
                },
                metadata_template="",
                metadata_seperator="",
@@ -91,6 +91,9 @@ class OCRReader(BaseReader):
                    metadata={
                        "page_label": page_id + 1,
                        "source": file_path.name,
+                        "file_path": str(file_path),
+                        "file_name": file_path.name,
+                        "filename": str(file_path),
                    },
                )
                for page_id, non_table_text in texts
--- a/knowledgehub/loaders/unstructured_loader.py
+++ b/knowledgehub/loaders/unstructured_loader.py
@@ -74,9 +74,10 @@ class UnstructuredReader(BaseReader):
        """ Process elements """
        docs = []
        file_name = Path(file).name
+        file_path = str(Path(file).resolve())
        if split_documents:
            for node in elements:
-                metadata = {"file_name": file_name}
+                metadata = {"file_name": file_name, "file_path": file_path}
                if hasattr(node, "metadata"):
                    """Load metadata fields"""
                    for field, val in vars(node.metadata).items():
@@ -99,7 +100,7 @@ class UnstructuredReader(BaseReader):

        else:
            text_chunks = [" ".join(str(el).split()) for el in elements]
-            metadata = {"file_name": file_name}
+            metadata = {"file_name": file_name, "file_path": file_path}

            if additional_metadata is not None:
                metadata.update(additional_metadata)