Add new OCRReader with PDF+OCR text merging (#66)

This change speeds up OCR extraction by allowing OCR to be bypassed for text that is irrelevant (i.e. not inside a table). --------- Co-authored-by: Nguyen Trung Duc (john) <trungduc1992@gmail.com>
2023-11-13 17:43:02 +07:00
parent d79b3744cb
commit 4704e2c11a
10 changed files with 523 additions and 126 deletions
--- a/tests/resources/7810d908b0ff4ce381dcab873196d133.jpg
+++ b/tests/resources/7810d908b0ff4ce381dcab873196d133.jpg
--- a/tests/resources/fullocr_sample_output.json
+++ b/tests/resources/fullocr_sample_output.json
--- a/tests/resources/table.pdf
+++ b/tests/resources/table.pdf
--- a/tests/test_table_reader.py
+++ b/tests/test_table_reader.py
@@ -5,7 +5,7 @@ import pytest

 from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader

-input_file = Path(__file__).parent / "resources" / "dummy.pdf"
+input_file = Path(__file__).parent / "resources" / "table.pdf"
 input_file_excel = Path(__file__).parent / "resources" / "dummy.xlsx"


@@ -30,7 +30,7 @@ def test_ocr_reader(fullocr_output):
    reader = OCRReader()
    documents = reader.load_data(input_file, response_content=fullocr_output)
    table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"]
-    assert len(table_docs) == 4
+    assert len(table_docs) == 2


 def test_mathpix_reader(mathpix_output):