Improve kotaemon based on insights from projects (#147)

- Include static files in the package.
- Make the information panel more reliable: it is faster and no longer breaks randomly.
- Add directory upload.
- Enable zip file upload.
- Allow setting the OCR reader endpoint through an environment variable (`OCR_READER_ENDPOINT`).
2024-02-28 22:18:29 +07:00
parent e1cf970a3d
commit 033e7e05cc
18 changed files with 618 additions and 56 deletions
--- a/libs/kotaemon/kotaemon/indices/ingests/files.py
+++ b/libs/kotaemon/kotaemon/indices/ingests/files.py
@@ -67,7 +67,7 @@ class DocumentIngestor(BaseComponent):

        main_reader = DirectoryReader(
            input_files=input_files,
-            file_extractor=file_extractors,  # type: ignore
+            file_extractor=file_extractors,
        )

        return main_reader
@@ -85,7 +85,9 @@ class DocumentIngestor(BaseComponent):
            file_paths = [file_paths]

        documents = self._get_reader(input_files=file_paths)()
+        print(f"Read {len(file_paths)} files into {len(documents)} documents.")
        nodes = self.text_splitter(documents)
+        print(f"Transform {len(documents)} documents into {len(nodes)} nodes.")
        self.log_progress(".num_docs", num_docs=len(nodes))

        # document parsers call
--- a/libs/kotaemon/kotaemon/indices/vectorindex.py
+++ b/libs/kotaemon/kotaemon/indices/vectorindex.py
@@ -59,12 +59,15 @@ class VectorIndexing(BaseIndexing):
                    f"Invalid input type {type(item)}, should be str or Document"
                )

+        print(f"Getting embeddings for {len(input_)} nodes")
        embeddings = self.embedding(input_)
+        print("Adding embeddings to vector store")
        self.vector_store.add(
            embeddings=embeddings,
            ids=[t.doc_id for t in input_],
        )
        if self.doc_store:
+            print("Adding documents to doc store")
            self.doc_store.add(input_)


--- a/libs/kotaemon/kotaemon/loaders/ocr_loader.py
+++ b/libs/kotaemon/kotaemon/loaders/ocr_loader.py
@@ -1,18 +1,34 @@
+import logging
+import os
 from pathlib import Path
 from typing import List, Optional
 from uuid import uuid4

 import requests
 from llama_index.readers.base import BaseReader
+from tenacity import after_log, retry, stop_after_attempt, wait_fixed, wait_random

 from kotaemon.base import Document

 from .utils.pdf_ocr import parse_ocr_output, read_pdf_unstructured
 from .utils.table import strip_special_chars_markdown

+logger = logging.getLogger(__name__)
+
 DEFAULT_OCR_ENDPOINT = "http://127.0.0.1:8000/v2/ai/infer/"


+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_fixed(5) + wait_random(0, 2),
+    after=after_log(logger, logging.DEBUG),
+)
+def tenacious_api_post(url, **kwargs):
+    resp = requests.post(url=url, **kwargs)
+    resp.raise_for_status()
+    return resp
+
+
 class OCRReader(BaseReader):
    """Read PDF using OCR, with high focus on table extraction

@@ -24,17 +40,20 @@ class OCRReader(BaseReader):
        ```

    Args:
-        endpoint: URL to FullOCR endpoint. Defaults to
+        endpoint: URL to FullOCR endpoint. If not provided, will look for
+            environment variable `OCR_READER_ENDPOINT` or use the default
            `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`
            (http://127.0.0.1:8000/v2/ai/infer/)
        use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF
            If False, only the table and text within table cells will be extracted.
    """

-    def __init__(self, endpoint: str = DEFAULT_OCR_ENDPOINT, use_ocr=True):
+    def __init__(self, endpoint: Optional[str] = None, use_ocr=True):
        """Init the OCR reader with OCR endpoint (FullOCR pipeline)"""
        super().__init__()
-        self.ocr_endpoint = endpoint
+        self.ocr_endpoint = endpoint or os.getenv(
+            "OCR_READER_ENDPOINT", DEFAULT_OCR_ENDPOINT
+        )
        self.use_ocr = use_ocr

    def load_data(
@@ -62,7 +81,7 @@ class OCRReader(BaseReader):
                ocr_results = kwargs["response_content"]
            else:
                # call original API
-                resp = requests.post(url=self.ocr_endpoint, files=files, data=data)
+                resp = tenacious_api_post(url=self.ocr_endpoint, files=files, data=data)
                ocr_results = resp.json()["result"]

        debug_path = kwargs.pop("debug_path", None)
--- a/libs/kotaemon/pyproject.toml
+++ b/libs/kotaemon/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
    "click",
    "pandas",
    "trogon",
+    "tenacity",
 ]
 readme = "README.md"
 license = { text = "MIT License" }