Best docs Cinnamon will probably ever have (#105)

This commit is contained in:
ian_Cin
2023-12-20 11:30:25 +07:00
committed by GitHub
parent 0e30dcbb06
commit 230328c62f
40 changed files with 1036 additions and 46 deletions

View File

@@ -93,7 +93,7 @@ def get_rect_iou(gt_box: List[tuple], pd_box: List[tuple], iou_type=0) -> int:
# compute the intersection over union by taking the intersection
# area and dividing it by the sum of prediction + ground-truth
# areas - the interesection area
# areas - the intersection area
if iou_type == 0:
iou = interArea / float(gt_area + pd_area - interArea)
elif iou_type == 1:

View File

@@ -34,8 +34,7 @@ def read_pdf_unstructured(input_path: Union[Path, str]):
from unstructured.partition.auto import partition
except ImportError:
raise ImportError(
"Please install unstructured PDF reader \
`pip install unstructured[pdf]`"
"Please install unstructured PDF reader `pip install unstructured[pdf]`"
)
page_items = defaultdict(list)
@@ -60,7 +59,7 @@ def read_pdf_unstructured(input_path: Union[Path, str]):
def merge_ocr_and_pdf_texts(
ocr_list: List[dict], pdf_text_list: List[dict], debug_info=None
):
"""Merge PDF and OCR text using IOU overlaping location
"""Merge PDF and OCR text using IOU overlapping location
Args:
ocr_list: List of OCR items {"text", "box", "location"}
pdf_text_list: List of PDF items {"text", "box", "location"}
@@ -115,7 +114,7 @@ def merge_ocr_and_pdf_texts(
def merge_table_cell_and_ocr(
table_list: List[dict], ocr_list: List[dict], pdf_list: List[dict], debug_info=None
):
"""Merge table items with OCR text using IOU overlaping location
"""Merge table items with OCR text using IOU overlapping location
Args:
table_list: List of table items
"type": ("table", "cell", "text"), "text", "box", "location"}
@@ -123,7 +122,7 @@ def merge_table_cell_and_ocr(
pdf_list: List of PDF items {"text", "box", "location"}
Returns:
all_table_cells: List of tables, each of table is reprented
all_table_cells: List of tables, each table is represented
by a list of cells with combined text from OCR
not_matched_items: List of PDF text items which are not overlapped by any table region
"""