Add new OCRReader with PDF+OCR text merging (#66)

This change speeds up OCR extraction by allowing OCR to be bypassed for text that is irrelevant (i.e., not inside a table).

---------

Co-authored-by: Nguyen Trung Duc (john) <trungduc1992@gmail.com>
This commit is contained in:
Tuan Anh Nguyen Dang (Tadashi_Cin)
2023-11-13 17:43:02 +07:00
committed by GitHub
parent d79b3744cb
commit 4704e2c11a
10 changed files with 523 additions and 126 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 322 KiB

File diff suppressed because one or more lines are too long

BIN
tests/resources/table.pdf Normal file

Binary file not shown.

View File

@@ -5,7 +5,7 @@ import pytest
from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader
input_file = Path(__file__).parent / "resources" / "dummy.pdf"
input_file = Path(__file__).parent / "resources" / "table.pdf"
input_file_excel = Path(__file__).parent / "resources" / "dummy.xlsx"
@@ -30,7 +30,7 @@ def test_ocr_reader(fullocr_output):
reader = OCRReader()
documents = reader.load_data(input_file, response_content=fullocr_output)
table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"]
assert len(table_docs) == 4
assert len(table_docs) == 2
def test_mathpix_reader(mathpix_output):