Add Azure AI Document Intelligence loader (#52)

* Add Azure AI Document Intelligence loader
* Add load_data interface to the Azure Document Intelligence loader
* Bump version
* Access Azure credentials from environment variables
2024-04-29 14:49:55 +07:00 · 2024-04-29 14:49:55 +07:00 · ec11b54ff2
commit ec11b54ff2
parent bbe862fe47
4 changed files with 86 additions and 1 deletions
--- a/libs/kotaemon/kotaemon/loaders/__init__.py
+++ b/libs/kotaemon/kotaemon/loaders/__init__.py
@ -1,4 +1,5 @@
 from .adobe_loader import AdobeReader
+from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader
 from .base import AutoReader, BaseReader
 from .composite_loader import DirectoryReader
 from .docx_loader import DocxReader
@ -10,6 +11,7 @@ from .unstructured_loader import UnstructuredReader

 __all__ = [
    "AutoReader",
+    "AzureAIDocumentIntelligenceLoader",
    "BaseReader",
    "PandasExcelReader",
    "MathpixPDFReader",
--- a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
+++ b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
@ -0,0 +1,68 @@
+import os
+from pathlib import Path
+from typing import Optional
+
+from kotaemon.base import Document, Param
+
+from .base import BaseReader
+
+
+class AzureAIDocumentIntelligenceLoader(BaseReader):
+    """Parse a document with Azure AI Document Intelligence.
+
+    The file is sent to the service as raw bytes, analyzed with the configured
+    model, and the markdown content of the result is returned as a single
+    ``Document``.
+
+    As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp,
+    tiff, heif, docx, xlsx, pptx and html.
+    """
+
+    # Name of the package this loader imports lazily in ``client_``.
+    _dependencies = ["azure-ai-documentintelligence"]
+
+    # NOTE: the environment variable names below spell "INTELLIGENT" (not
+    # "INTELLIGENCE"). They are kept as-is so existing deployments keep working.
+    # The lookups are evaluated once, when this class body is executed.
+    endpoint: str = Param(
+        os.environ.get("AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT"),
+        help="Endpoint of Azure AI Document Intelligence",
+    )
+    credential: str = Param(
+        os.environ.get("AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL"),
+        help="Credential of Azure AI Document Intelligence",
+    )
+    model: str = Param(
+        "prebuilt-layout",
+        help=(
+            "Model to use for document analysis. Default is prebuilt-layout. "
+            "As of April 24, you can view the supported models [here]"
+            "(https://learn.microsoft.com/en-us/azure/ai-services/"
+            "document-intelligence/concept-model-overview?view=doc-intel-4.0.0"
+            "#model-analysis-features)"
+        ),
+    )
+
+    @Param.auto(depends_on=["endpoint", "credential"])
+    def client_(self):
+        """Build the Document Intelligence client from endpoint and credential.
+
+        Declared as depending on ``endpoint`` and ``credential``; presumably the
+        framework rebuilds it when either changes (confirm against ``Param.auto``).
+
+        Raises:
+            ImportError: if ``azure-ai-documentintelligence`` is not installed.
+            ValueError: if ``endpoint`` or ``credential`` is missing or empty.
+        """
+        # Imported lazily so the package stays optional for other loaders.
+        try:
+            from azure.ai.documentintelligence import DocumentIntelligenceClient
+            from azure.core.credentials import AzureKeyCredential
+        except ImportError as exc:
+            raise ImportError(
+                "Please install azure-ai-documentintelligence"
+            ) from exc
+
+        # Fail early with an actionable message instead of an opaque SDK error
+        # when neither the constructor nor the environment supplied a value.
+        if not self.endpoint or not self.credential:
+            raise ValueError(
+                "AzureAIDocumentIntelligenceLoader requires both `endpoint` and "
+                "`credential`; pass them explicitly or set "
+                "AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT and "
+                "AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL"
+            )
+
+        return DocumentIntelligenceClient(
+            self.endpoint, AzureKeyCredential(self.credential)
+        )
+
+    def run(
+        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        """Analyze ``file_path``; converts it to ``Path`` and calls ``load_data``."""
+        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)
+
+    def load_data(
+        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        """Send the file to Azure and wrap the markdown result in a ``Document``.
+
+        Args:
+            file_path: path of the file to analyze; it is opened in binary mode.
+            extra_info: optional metadata attached to the returned document. An
+                empty dict is used when it is ``None`` or empty.
+            **kwargs: accepted for interface compatibility; currently unused.
+
+        Returns:
+            A one-element list holding the analyzed content and the metadata.
+        """
+        metadata = extra_info or {}
+        with open(file_path, "rb") as stream:
+            poller = self.client_.begin_analyze_document(
+                self.model,
+                analyze_request=stream,
+                content_type="application/octet-stream",
+                output_content_format="markdown",
+            )
+            # Wait for the analysis operation to finish before closing the file.
+            result = poller.result()
+
+        return [Document(content=result.content, metadata=metadata)]
--- a/libs/kotaemon/pyproject.toml
+++ b/libs/kotaemon/pyproject.toml
@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"]
 # metadata and dependencies
 [project]
 name = "kotaemon"
-version = "0.3.11"
+version = "0.3.12"
 requires-python = ">= 3.10"
 description = "Kotaemon core library for AI development."
 dependencies = [
@ -64,6 +64,7 @@ adv = [
    "pdfservices-sdk @  git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements",
    "fastembed",
    "beautifulsoup4",
+    "azure-ai-documentintelligence",
 ]
 dev = [
    "ipython",
--- a/libs/kotaemon/tests/test_reader.py
+++ b/libs/kotaemon/tests/test_reader.py
@ -1,4 +1,5 @@
 from pathlib import Path
+from unittest.mock import patch

 from langchain.schema import Document as LangchainDocument
 from llama_index.node_parser import SimpleNodeParser
@ -6,6 +7,7 @@ from llama_index.node_parser import SimpleNodeParser
 from kotaemon.base import Document
 from kotaemon.loaders import (
    AutoReader,
+    AzureAIDocumentIntelligenceLoader,
    DocxReader,
    HtmlReader,
    MhtmlReader,
@ -76,3 +78,15 @@ def test_mhtml_reader():

    assert len(docs) == 1
    assert docs[0].text.startswith("This is a test")
+
+
+@patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
+def test_azureai_document_intelligence_reader(mock_client):
+    """The loader returns one document and builds the Azure client once."""
+    sample_pdf = Path(__file__).parent / "resources" / "dummy.pdf"
+    loader = AzureAIDocumentIntelligenceLoader(
+        endpoint="https://endpoint.com", credential="credential"
+    )
+
+    documents = loader(sample_pdf)
+
+    assert len(documents) == 1
+    # The SDK client class is patched, so this counts constructor calls only.
+    mock_client.assert_called_once()