Add new OCRReader with PDF+OCR text merging (#66)
This change speeds up OCR extraction by allowing OCR to be bypassed for text that is irrelevant (i.e., not inside a table). --------- Co-authored-by: Nguyen Trung Duc (john) <trungduc1992@gmail.com>
This commit is contained in:
committed by
GitHub
parent
d79b3744cb
commit
4704e2c11a
BIN
tests/resources/7810d908b0ff4ce381dcab873196d133.jpg
Normal file
BIN
tests/resources/7810d908b0ff4ce381dcab873196d133.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 322 KiB |
File diff suppressed because one or more lines are too long
BIN
tests/resources/table.pdf
Normal file
BIN
tests/resources/table.pdf
Normal file
Binary file not shown.
@@ -5,7 +5,7 @@ import pytest
|
||||
|
||||
from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader
|
||||
|
||||
input_file = Path(__file__).parent / "resources" / "dummy.pdf"
|
||||
input_file = Path(__file__).parent / "resources" / "table.pdf"
|
||||
input_file_excel = Path(__file__).parent / "resources" / "dummy.xlsx"
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@ def test_ocr_reader(fullocr_output):
|
||||
reader = OCRReader()
|
||||
documents = reader.load_data(input_file, response_content=fullocr_output)
|
||||
table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"]
|
||||
assert len(table_docs) == 4
|
||||
assert len(table_docs) == 2
|
||||
|
||||
|
||||
def test_mathpix_reader(mathpix_output):
|
||||
|
Reference in New Issue
Block a user