Best docs Cinnamon will probably ever have (#105)
This commit is contained in:
@@ -4,7 +4,7 @@ from typing import Any, List, Type, Union
|
||||
from llama_index import SimpleDirectoryReader, download_loader
|
||||
from llama_index.readers.base import BaseReader
|
||||
|
||||
from ..base import BaseComponent, Document
|
||||
from kotaemon.base import BaseComponent, Document
|
||||
|
||||
|
||||
class AutoReader(BaseComponent):
|
||||
|
@@ -93,7 +93,7 @@ def get_rect_iou(gt_box: List[tuple], pd_box: List[tuple], iou_type=0) -> int:
|
||||
|
||||
# compute the intersection over union by taking the intersection
|
||||
# area and dividing it by the sum of prediction + ground-truth
|
||||
# areas - the interesection area
|
||||
# areas - the intersection area
|
||||
if iou_type == 0:
|
||||
iou = interArea / float(gt_area + pd_area - interArea)
|
||||
elif iou_type == 1:
|
||||
|
@@ -34,8 +34,7 @@ def read_pdf_unstructured(input_path: Union[Path, str]):
|
||||
from unstructured.partition.auto import partition
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install unstructured PDF reader \
|
||||
`pip install unstructured[pdf]`"
|
||||
"Please install unstructured PDF reader `pip install unstructured[pdf]`"
|
||||
)
|
||||
|
||||
page_items = defaultdict(list)
|
||||
@@ -60,7 +59,7 @@ def read_pdf_unstructured(input_path: Union[Path, str]):
|
||||
def merge_ocr_and_pdf_texts(
|
||||
ocr_list: List[dict], pdf_text_list: List[dict], debug_info=None
|
||||
):
|
||||
"""Merge PDF and OCR text using IOU overlaping location
|
||||
"""Merge PDF and OCR text using IOU overlapping location
|
||||
Args:
|
||||
ocr_list: List of OCR items {"text", "box", "location"}
|
||||
pdf_text_list: List of PDF items {"text", "box", "location"}
|
||||
@@ -115,7 +114,7 @@ def merge_ocr_and_pdf_texts(
|
||||
def merge_table_cell_and_ocr(
|
||||
table_list: List[dict], ocr_list: List[dict], pdf_list: List[dict], debug_info=None
|
||||
):
|
||||
"""Merge table items with OCR text using IOU overlaping location
|
||||
"""Merge table items with OCR text using IOU overlapping location
|
||||
Args:
|
||||
table_list: List of table items
|
||||
"type": ("table", "cell", "text"), "text", "box", "location"}
|
||||
@@ -123,7 +122,7 @@ def merge_table_cell_and_ocr(
|
||||
pdf_list: List of PDF items {"text", "box", "location"}
|
||||
|
||||
Returns:
|
||||
all_table_cells: List of tables, each of table is reprented
|
||||
all_table_cells: List of tables, each of table is represented
|
||||
by list of cells with combined text from OCR
|
||||
not_matched_items: List of PDF text which is not overlapped by table region
|
||||
"""
|
||||
|
Reference in New Issue
Block a user