diff --git a/README.md b/README.md
index adcb05a..7e796ad 100644
--- a/README.md
+++ b/README.md
@@ -216,6 +216,17 @@ documents and developers who want to build their own RAG pipeline.
 
 See [Local model setup](docs/local_model.md).
 
+### Set up multimodal document parsing (OCR, table parsing, figure extraction)
+
+The following options are available:
+
+- [Azure Document Intelligence (API)](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence)
+- [Adobe PDF Extract (API)](https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/)
+- [Docling (local, open-source)](https://github.com/DS4SD/docling)
+  - To use Docling, first install the required dependencies: `pip install docling`
+
+Select the corresponding loader in `Settings -> Retrieval Settings -> File loader`.
+
 ### Customize your application
 
 - By default, all application data is stored in the `./ktem_app_data` folder. You can back up or copy this folder to transfer your installation to a new machine.
diff --git a/libs/kotaemon/kotaemon/indices/ingests/files.py b/libs/kotaemon/kotaemon/indices/ingests/files.py
index d82bc16..18db7ca 100644
--- a/libs/kotaemon/kotaemon/indices/ingests/files.py
+++ b/libs/kotaemon/kotaemon/indices/ingests/files.py
@@ -13,6 +13,7 @@ from kotaemon.loaders import (
     AdobeReader,
     AzureAIDocumentIntelligenceLoader,
     DirectoryReader,
+    DoclingReader,
     HtmlReader,
     MathpixPDFReader,
     MhtmlReader,
@@ -32,9 +33,10 @@ azure_reader = AzureAIDocumentIntelligenceLoader(
     credential=str(config("AZURE_DI_CREDENTIAL", default="")),
     cache_dir=getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None),
 )
-adobe_reader.vlm_endpoint = azure_reader.vlm_endpoint = getattr(
-    flowsettings, "KH_VLM_ENDPOINT", ""
-)
+docling_reader = DoclingReader()
+adobe_reader.vlm_endpoint = (
+    azure_reader.vlm_endpoint
+) = docling_reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")
 
 
 KH_DEFAULT_FILE_EXTRACTORS: dict[str, BaseReader] = {
diff --git a/libs/kotaemon/kotaemon/loaders/__init__.py b/libs/kotaemon/kotaemon/loaders/__init__.py
index 0c3fbcb..f498da8 100644
--- a/libs/kotaemon/kotaemon/loaders/__init__.py
+++ b/libs/kotaemon/kotaemon/loaders/__init__.py
@@ -2,6 +2,7 @@ from .adobe_loader import AdobeReader
 from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader
 from .base import AutoReader, BaseReader
 from .composite_loader import DirectoryReader
+from .docling_loader import DoclingReader
 from .docx_loader import DocxReader
 from .excel_loader import ExcelReader, PandasExcelReader
 from .html_loader import HtmlReader, MhtmlReader
@@ -30,4 +31,5 @@ __all__ = [
     "TxtReader",
     "PDFThumbnailReader",
     "WebReader",
+    "DoclingReader",
 ]
diff --git a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
index 1e1d902..0e66c84 100644
--- a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
+++ b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
@@ -25,6 +25,9 @@ def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Imag
     """
     left, upper, right, lower = bbox
 
+    left, right = min(left, right), max(left, right)
+    upper, lower = min(upper, lower), max(upper, lower)
+
     img: Image.Image
     suffix = file_path.suffix.lower()
     if suffix == ".pdf":
diff --git a/libs/kotaemon/kotaemon/loaders/docling_loader.py b/libs/kotaemon/kotaemon/loaders/docling_loader.py
new file mode 100644
index 0000000..2f2b43f
--- /dev/null
+++ b/libs/kotaemon/kotaemon/loaders/docling_loader.py
@@ -0,0 +1,232 @@
+import base64
+from collections import defaultdict
+from io import BytesIO
+from pathlib import Path
+from typing import List, Optional
+
+from kotaemon.base import Document, Param
+
+from .azureai_document_intelligence_loader import crop_image
+from .base import BaseReader
+from .utils.adobe import generate_single_figure_caption, make_markdown_table
+
+
+class DoclingReader(BaseReader):
+    """Use Docling to extract document structure and content"""
+
+    _dependencies = ["docling"]
+
+    vlm_endpoint: str = Param(
+        help=(
+            "Default VLM endpoint for figure captioning. "
+            "If not provided, figure extraction is skipped."
+        )
+    )
+
+    max_figure_to_caption: int = Param(
+        100,
+        help=(
+            "The maximum number of figures to caption. "
+            "The rest will be indexed without captions."
+        ),
+    )
+
+    figure_friendly_filetypes: list[str] = Param(
+        [".pdf", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif", ".tif"],
+        help=(
+            "File types from which we can reliably extract figures. "
+            "For files like .docx or .html, the visual layout may differ "
+            "depending on the tool used to view them, so the reported figure "
+            "locations cannot be used for cropping."
+        ),
+    )
+
+    @Param.auto(cache=True)
+    def converter_(self):
+        try:
+            from docling.document_converter import DocumentConverter
+        except ImportError:
+            raise ImportError("Please install docling: 'pip install docling'")
+
+        return DocumentConverter()
+
+    def run(
+        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> List[Document]:
+        return self.load_data(file_path, extra_info, **kwargs)
+
+    def load_data(
+        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> List[Document]:
+        """Extract the input file, allowing multi-modal extraction"""
+
+        metadata = extra_info or {}
+
+        result = self.converter_.convert(file_path)
+        result_dict = result.document.export_to_dict()
+
+        file_path = Path(file_path)
+        file_name = file_path.name
+
+        # extract the figures
+        figures = []
+        gen_caption_count = 0
+        for figure_obj in result_dict.get("pictures", []):
+            # without a VLM endpoint, figure extraction is skipped entirely
+            if not self.vlm_endpoint:
+                continue
+            if file_path.suffix.lower() not in self.figure_friendly_filetypes:
+                continue
+
+            # retrieve the extractive captions provided by docling
+            caption_refs = [caption["$ref"] for caption in figure_obj["captions"]]
+            extractive_captions = []
+            for caption_ref in caption_refs:
+                text_id = caption_ref.split("/")[-1]
+                try:
+                    caption_text = result_dict["texts"][int(text_id)]["text"]
+                    extractive_captions.append(caption_text)
+                except (ValueError, TypeError, IndexError) as e:
+                    print(e)
+                    continue
+
+            # read & crop the image
+            page_number = figure_obj["prov"][0]["page_no"]
+
+            try:
+                page_number_text = str(page_number)
+                page_width = result_dict["pages"][page_number_text]["size"]["width"]
+                page_height = result_dict["pages"][page_number_text]["size"]["height"]
+
+                bbox_obj = figure_obj["prov"][0]["bbox"]
+                bbox: list[float] = [
+                    bbox_obj["l"],
+                    bbox_obj["t"],
+                    bbox_obj["r"],
+                    bbox_obj["b"],
+                ]
+                # crop_image expects normalized top-left coordinates
+                if bbox_obj["coord_origin"] == "BOTTOMLEFT":
+                    bbox = self._convert_bbox_bl_tl(bbox, page_width, page_height)
+
+                img = crop_image(file_path, bbox, page_number - 1)
+            except KeyError as e:
+                print(e, list(result_dict["pages"].keys()))
+                continue
+
+            # convert img to base64
+            img_bytes = BytesIO()
+            img.save(img_bytes, format="PNG")
+            img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
+            img_base64 = f"data:image/png;base64,{img_base64}"
+
+            # generate the generative caption
+            if gen_caption_count >= self.max_figure_to_caption:
+                gen_caption = ""
+            else:
+                gen_caption_count += 1
+                gen_caption = generate_single_figure_caption(
+                    img_base64, self.vlm_endpoint
+                )
+
+            # join the extractive and generative captions
+            caption = "\n".join(extractive_captions + [gen_caption])
+
+            # store the image in a document
+            figure_metadata = {
+                "image_origin": img_base64,
+                "type": "image",
+                "page_label": page_number,
+                "file_name": file_name,
+                "file_path": file_path,
+            }
+            figure_metadata.update(metadata)
+
+            figures.append(
+                Document(
+                    text=caption,
+                    metadata=figure_metadata,
+                )
+            )
+
+        # extract the tables
+        tables = []
+        for table_obj in result_dict.get("tables", []):
+            # convert the table into markdown format
+            markdown_table = self._parse_table(table_obj)
+            caption_refs = [caption["$ref"] for caption in table_obj["captions"]]
+
+            extractive_captions = []
+            for caption_ref in caption_refs:
+                text_id = caption_ref.split("/")[-1]
+                try:
+                    caption_text = result_dict["texts"][int(text_id)]["text"]
+                    extractive_captions.append(caption_text)
+                except (ValueError, TypeError, IndexError) as e:
+                    print(e)
+                    continue
+
+            # prepend the extractive captions to the table
+            caption = "\n".join(extractive_captions)
+            markdown_table = f"{caption}\n{markdown_table}"
+
+            page_number = table_obj["prov"][0].get("page_no", 1)
+
+            table_metadata = {
+                "type": "table",
+                "page_label": page_number,
+                "table_origin": markdown_table,
+                "file_name": file_name,
+                "file_path": file_path,
+            }
+            table_metadata.update(metadata)
+
+            tables.append(
+                Document(
+                    text=markdown_table,
+                    metadata=table_metadata,
+                )
+            )
+
+        # join the plain text elements, grouped by page
+        texts = []
+        page_number_to_text = defaultdict(list)
+
+        for text_obj in result_dict["texts"]:
+            page_number = text_obj["prov"][0].get("page_no", 1)
+            page_number_to_text[page_number].append(text_obj["text"])
+
+        for page_number, txts in page_number_to_text.items():
+            texts.append(
+                Document(
+                    text="\n".join(txts),
+                    metadata={
+                        "page_label": page_number,
+                        "file_name": file_name,
+                        "file_path": file_path,
+                        **metadata,
+                    },
+                )
+            )
+
+        return texts + tables + figures
+
+    def _convert_bbox_bl_tl(
+        self, bbox: list[float], page_width: int, page_height: int
+    ) -> list[float]:
+        """Convert bbox from absolute bottom-left origin to normalized top-left"""
+        x0, y0, x1, y1 = bbox
+        return [
+            x0 / page_width,
+            (page_height - y1) / page_height,
+            x1 / page_width,
+            (page_height - y0) / page_height,
+        ]
+
+    def _parse_table(self, table_obj: dict) -> str:
+        """Convert a docling table object to a markdown table"""
+        table_as_list: List[List[str]] = []
+        grid = table_obj["data"]["grid"]
+        for row in grid:
+            table_as_list.append([])
+            for cell in row:
+                table_as_list[-1].append(cell["text"])
+
+        return make_markdown_table(table_as_list)
diff --git a/libs/kotaemon/kotaemon/loaders/utils/adobe.py b/libs/kotaemon/kotaemon/loaders/utils/adobe.py
index f1adcd5..24b3d9d 100644
--- a/libs/kotaemon/kotaemon/loaders/utils/adobe.py
+++ b/libs/kotaemon/kotaemon/loaders/utils/adobe.py
@@ -110,7 +110,7 @@ def request_adobe_service(file_path: str, output_path: str = "") -> str:
     return output_path
 
 
-def make_markdown_table(table_as_list: List[str]) -> str:
+def make_markdown_table(table_as_list: List[List[str]]) -> str:
     """
     Convert table from python list representation to markdown format.
     The input list consists of rows of tables, the first row is the header.
@@ -203,17 +203,21 @@ def parse_figure_paths(file_paths: List[Path]) -> Union[bytes, str]:
 
 
 def generate_single_figure_caption(figure: str, vlm_endpoint: str) -> str:
     """Summarize a single figure using GPT-4V"""
+    output = ""
+
     if figure:
-        output = generate_gpt4v(
-            endpoint=vlm_endpoint,
-            prompt="Provide a short 2 sentence summary of this image?",
-            images=figure,
-        )
-        if "sorry" in output.lower():
-            output = ""
-    else:
-        output = ""
+        try:
+            output = generate_gpt4v(
+                endpoint=vlm_endpoint,
+                prompt="Provide a short 2 sentence summary of this image?",
+                images=figure,
+            )
+            if "sorry" in output.lower():
+                output = ""
+        except Exception as e:
+            print(f"Error generating caption: {e}")
+
     return output
 
 
diff --git a/libs/ktem/ktem/index/file/pipelines.py b/libs/ktem/ktem/index/file/pipelines.py
index 4898a9d..afe90aa 100644
--- a/libs/ktem/ktem/index/file/pipelines.py
+++ b/libs/ktem/ktem/index/file/pipelines.py
@@ -39,6 +39,7 @@ from kotaemon.indices.ingests.files import (
     KH_DEFAULT_FILE_EXTRACTORS,
     adobe_reader,
     azure_reader,
+    docling_reader,
     unstructured,
     web_reader,
 )
@@ -673,6 +674,8 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
             readers[".pdf"] = adobe_reader
         elif self.reader_mode == "azure-di":
             readers[".pdf"] = azure_reader
+        elif self.reader_mode == "docling":
+            readers[".pdf"] = docling_reader
 
         dev_readers, _, _ = dev_settings()
         readers.update(dev_readers)
@@ -692,6 +695,7 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
                     "Azure AI Document Intelligence (figure+table extraction)",
                     "azure-di",
                 ),
+                ("Docling", "docling"),
             ],
             "component": "dropdown",
         },
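
For reference, a minimal sketch of using the new reader standalone, outside the ktem UI (developers building their own RAG pipeline may prefer this). It assumes `docling` is installed via `pip install docling`; `sample.pdf` and the endpoint value are placeholders, not part of this change:

```python
# Minimal usage sketch for DoclingReader (hypothetical file path and endpoint).
from kotaemon.loaders import DoclingReader

reader = DoclingReader()
# Optional: set a GPT-4V-compatible endpoint to enable figure captioning.
# With an empty endpoint, figure extraction is skipped and only text and
# tables are returned.
reader.vlm_endpoint = ""  # placeholder, e.g. your VLM endpoint URL

docs = reader.load_data("sample.pdf")  # placeholder path
for doc in docs:
    # figure/table documents carry a "type" in their metadata; plain text
    # documents do not, hence the "text" default here
    print(doc.metadata.get("type", "text"), doc.metadata.get("page_label"))
```

The returned list is ordered as texts, then tables, then figures, matching the `return texts + tables + figures` in `load_data`; in the app itself the same reader is wired up through the `Settings -> Retrieval Settings -> File loader` dropdown added in `pipelines.py`.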