Add docstring for database and OCR loader

2024-02-20 21:20:48 +07:00
parent 767aaaa1ef
commit 08cc99d8db
2 changed files with 75 additions and 14 deletions
--- a/libs/kotaemon/kotaemon/loaders/ocr_loader.py
+++ b/libs/kotaemon/kotaemon/loaders/ocr_loader.py
@@ -14,14 +14,25 @@ DEFAULT_OCR_ENDPOINT = "http://127.0.0.1:8000/v2/ai/infer/"


 class OCRReader(BaseReader):
-    def __init__(self, endpoint: str = DEFAULT_OCR_ENDPOINT, use_ocr=True):
-        """Init the OCR reader with OCR endpoint (FullOCR pipeline)
+    """Read PDF using OCR, with high focus on table extraction

-        Args:
-            endpoint: URL to FullOCR endpoint. Defaults to OCR_ENDPOINT.
-            use_ocr: whether to use OCR to read text
-                (e.g: from images, tables) in the PDF
-        """
+    Example:
+        ```python
+        >>> from kotaemon.loaders import OCRReader
+        >>> reader = OCRReader()
+        >>> documents = reader.load_data("path/to/pdf")
+        ```
+
+    Args:
+        endpoint: URL to FullOCR endpoint. Defaults to
+            `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`
+            (http://127.0.0.1:8000/v2/ai/infer/)
+        use_ocr: whether to use OCR to read text (e.g., from images, tables) in the PDF.
+            If False, only the table and text within table cells will be extracted.
+    """
+
+    def __init__(self, endpoint: str = DEFAULT_OCR_ENDPOINT, use_ocr=True):
+        """Init the OCR reader with OCR endpoint (FullOCR pipeline)"""
        super().__init__()
        self.ocr_endpoint = endpoint
        self.use_ocr = use_ocr