From ec11b54ff22a667b86f71ef89768045d595fbfe6 Mon Sep 17 00:00:00 2001
From: "Duc Nguyen (john)" <trungduc1992@gmail.com>
Date: Mon, 29 Apr 2024 14:49:55 +0700
Subject: [PATCH] Add Azure AI Document Intelligence loader (#52)

* Add azureai document intelligence loader

* Add load_data interface to Azure DI

* Bump version

* Access azure credentials from environment variables
---
 libs/kotaemon/kotaemon/loaders/__init__.py    |  2 +
 .../azureai_document_intelligence_loader.py   | 68 +++++++++++++++++++
 libs/kotaemon/pyproject.toml                  |  3 +-
 libs/kotaemon/tests/test_reader.py            | 14 ++++
 4 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py

diff --git a/libs/kotaemon/kotaemon/loaders/__init__.py b/libs/kotaemon/kotaemon/loaders/__init__.py
index f0e7d0f..6ccdbda 100644
--- a/libs/kotaemon/kotaemon/loaders/__init__.py
+++ b/libs/kotaemon/kotaemon/loaders/__init__.py
@@ -1,4 +1,5 @@
 from .adobe_loader import AdobeReader
+from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader
 from .base import AutoReader, BaseReader
 from .composite_loader import DirectoryReader
 from .docx_loader import DocxReader
@@ -10,6 +11,7 @@ from .unstructured_loader import UnstructuredReader
 
 __all__ = [
     "AutoReader",
+    "AzureAIDocumentIntelligenceLoader",
     "BaseReader",
     "PandasExcelReader",
     "MathpixPDFReader",
diff --git a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
new file mode 100644
index 0000000..7e4c516
--- /dev/null
+++ b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
@@ -0,0 +1,68 @@
+import os
+from pathlib import Path
+from typing import Optional
+
+from kotaemon.base import Document, Param
+
+from .base import BaseReader
+
+
+class AzureAIDocumentIntelligenceLoader(BaseReader):
+    """Parse a document into Markdown with Azure AI Document Intelligence.
+
+    As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp,
+    tiff, heif, docx, xlsx, pptx and html.
+    """
+
+    _dependencies = ["azure-ai-documentintelligence"]
+    endpoint: str = Param(
+        os.getenv("AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT"),  # read at import time
+        help="Endpoint of Azure AI Document Intelligence",
+    )
+    credential: str = Param(
+        os.getenv("AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL"),  # read at import time
+        help="Credential of Azure AI Document Intelligence",
+    )
+    model: str = Param(
+        "prebuilt-layout",
+        help=(
+            "Model to use for document analysis. Default is prebuilt-layout. "
+            "As of April 24, you can view the supported models [here]"
+            "(https://learn.microsoft.com/en-us/azure/ai-services/"
+            "document-intelligence/concept-model-overview?view=doc-intel-4.0.0"
+            "#model-analysis-features)"
+        ),
+    )
+
+    @Param.auto(depends_on=["endpoint", "credential"])
+    def client_(self):
+        """Build the Azure client from `endpoint` and `credential` (an API key)."""
+        try:
+            from azure.ai.documentintelligence import DocumentIntelligenceClient
+            from azure.core.credentials import AzureKeyCredential
+        except ImportError as err:  # keep the original failure as __cause__
+            raise ImportError("Please install azure-ai-documentintelligence") from err
+        return DocumentIntelligenceClient(
+            self.endpoint, AzureKeyCredential(self.credential)
+        )
+
+    def run(
+        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        """Coerce `file_path` to a `Path` and delegate to `load_data`."""
+        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)
+
+    def load_data(
+        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        """Analyze `file_path`; return one Document with `extra_info` as metadata."""
+        metadata = extra_info or {}
+        with open(file_path, "rb") as fi:
+            poller = self.client_.begin_analyze_document(
+                self.model,
+                fi,  # positional: newer SDKs renamed `analyze_request` to `body`
+                content_type="application/octet-stream",
+                output_content_format="markdown",
+            )
+            result = poller.result()
+        return [Document(content=result.content, metadata=metadata)]
diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml
index 43678b4..b218b04 100644
--- a/libs/kotaemon/pyproject.toml
+++ b/libs/kotaemon/pyproject.toml
@@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"]
 # metadata and dependencies
 [project]
 name = "kotaemon"
-version = "0.3.11"
+version = "0.3.12"
 requires-python = ">= 3.10"
 description = "Kotaemon core library for AI development."
 dependencies = [
@@ -64,6 +64,7 @@ adv = [
     "pdfservices-sdk @  git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements",
     "fastembed",
     "beautifulsoup4",
+    "azure-ai-documentintelligence",
 ]
 dev = [
     "ipython",
diff --git a/libs/kotaemon/tests/test_reader.py b/libs/kotaemon/tests/test_reader.py
index 0aa2f2b..0cdfa51 100644
--- a/libs/kotaemon/tests/test_reader.py
+++ b/libs/kotaemon/tests/test_reader.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from unittest.mock import patch
 
 from langchain.schema import Document as LangchainDocument
 from llama_index.node_parser import SimpleNodeParser
@@ -6,6 +7,7 @@ from llama_index.node_parser import SimpleNodeParser
 from kotaemon.base import Document
 from kotaemon.loaders import (
     AutoReader,
+    AzureAIDocumentIntelligenceLoader,
     DocxReader,
     HtmlReader,
     MhtmlReader,
@@ -76,3 +78,15 @@ def test_mhtml_reader():
 
     assert len(docs) == 1
     assert docs[0].text.startswith("This is a test")
+
+
+@patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
+def test_azureai_document_intelligence_reader(mock_client):
+    """Loader builds the mocked Azure client once and returns one document."""
+    sample_pdf = Path(__file__).parent / "resources" / "dummy.pdf"
+    loader = AzureAIDocumentIntelligenceLoader(
+        endpoint="https://endpoint.com", credential="credential"
+    )
+    parsed = loader(sample_pdf)
+    assert len(parsed) == 1
+    mock_client.assert_called_once()