From ec11b54ff22a667b86f71ef89768045d595fbfe6 Mon Sep 17 00:00:00 2001
From: "Duc Nguyen (john)"
Date: Mon, 29 Apr 2024 14:49:55 +0700
Subject: [PATCH] Add Azure AI Document Intelligence loader (#52)

* Add azureai document intelligence loader
* Add load_data interface to Azure DI
* Bump version
* Access azure credentials from environment variables
---
Reviewer notes (text between the three-dash line and the diffstat is ignored
by `git am`; it is not part of the commit message or of any hunk):

  * Purpose: adds `AzureAIDocumentIntelligenceLoader` (a `BaseReader`
    subclass), exports it from `kotaemon.loaders`, adds
    `azure-ai-documentintelligence` to the `adv` extra, bumps the package
    version 0.3.11 -> 0.3.12 and adds one mocked test.
  * `endpoint` and `credential` take their defaults from `os.environ.get(...)`
    calls written directly in the class body, so the environment is read once,
    when the module is imported. Variables exported afterwards only take
    effect if the values are passed explicitly. When a variable is unset the
    default is None although both fields are annotated `str`.
  * The environment variable names are spelled `AZUREAI_DOCUMENT_INTELLIGENT_*`
    while the service and the class are spelled "Intelligence".
    NOTE(review): confirm the spelling is intended before users depend on it.
  * `client_` imports the Azure SDK inside the function and re-raises
    ImportError with an install hint (without `from` chaining). It is declared
    with `depends_on=["endpoint", "credential"]`; presumably `Param.auto`
    rebuilds the client when either changes -- confirm against `Param.auto`.
  * `run()` converts `file_path` to `Path` and delegates to `load_data()`.
    `load_data()` opens the file in binary mode, passes the open file to
    `begin_analyze_document` with `self.model` and
    `output_content_format="markdown"`, blocks on `poller.result()`, and
    returns a one-element list holding a `Document` whose content is
    `result.content` and whose metadata is `extra_info` (or `{}`).
    `**kwargs` is accepted but not forwarded to the Azure client.
  * "As of April 24" in the class docstring and in the `model` help text
    presumably means April 2024 -- TODO confirm and spell out the year.
  * The new test patches `azure.ai.documentintelligence.DocumentIntelligenceClient`;
    this works because `client_` imports the class at call time. The patch
    target must be importable, so the test needs the Azure SDK installed.
    It also reads `tests/resources/dummy.pdf`, which this patch does not add
    -- presumably it already exists in the repository.

 libs/kotaemon/kotaemon/loaders/__init__.py    |  2 +
 .../azureai_document_intelligence_loader.py   | 68 +++++++++++++++++++
 libs/kotaemon/pyproject.toml                  |  3 +-
 libs/kotaemon/tests/test_reader.py            | 14 ++++
 4 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py

diff --git a/libs/kotaemon/kotaemon/loaders/__init__.py b/libs/kotaemon/kotaemon/loaders/__init__.py
index f0e7d0f..6ccdbda 100644
--- a/libs/kotaemon/kotaemon/loaders/__init__.py
+++ b/libs/kotaemon/kotaemon/loaders/__init__.py
@@ -1,4 +1,5 @@
 from .adobe_loader import AdobeReader
+from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader
 from .base import AutoReader, BaseReader
 from .composite_loader import DirectoryReader
 from .docx_loader import DocxReader
@@ -10,6 +11,7 @@ from .unstructured_loader import UnstructuredReader
 
 __all__ = [
     "AutoReader",
+    "AzureAIDocumentIntelligenceLoader",
     "BaseReader",
     "PandasExcelReader",
     "MathpixPDFReader",
diff --git a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
new file mode 100644
index 0000000..7e4c516
--- /dev/null
+++ b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
@@ -0,0 +1,68 @@
+import os
+from pathlib import Path
+from typing import Optional
+
+from kotaemon.base import Document, Param
+
+from .base import BaseReader
+
+
+class AzureAIDocumentIntelligenceLoader(BaseReader):
+    """Utilize Azure AI Document Intelligence to parse document
+
+    As of April 24, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff,
+    heif, docx, xlsx, pptx and html.
+    """
+
+    _dependencies = ["azure-ai-documentintelligence"]
+
+    endpoint: str = Param(
+        os.environ.get("AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT", None),
+        help="Endpoint of Azure AI Document Intelligence",
+    )
+    credential: str = Param(
+        os.environ.get("AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL", None),
+        help="Credential of Azure AI Document Intelligence",
+    )
+    model: str = Param(
+        "prebuilt-layout",
+        help=(
+            "Model to use for document analysis. Default is prebuilt-layout. "
+            "As of April 24, you can view the supported models [here]"
+            "(https://learn.microsoft.com/en-us/azure/ai-services/"
+            "document-intelligence/concept-model-overview?view=doc-intel-4.0.0"
+            "#model-analysis-features)"
+        ),
+    )
+
+    @Param.auto(depends_on=["endpoint", "credential"])
+    def client_(self):
+        try:
+            from azure.ai.documentintelligence import DocumentIntelligenceClient
+            from azure.core.credentials import AzureKeyCredential
+        except ImportError:
+            raise ImportError("Please install azure-ai-documentintelligence")
+
+        return DocumentIntelligenceClient(
+            self.endpoint, AzureKeyCredential(self.credential)
+        )
+
+    def run(
+        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)
+
+    def load_data(
+        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        metadata = extra_info or {}
+        with open(file_path, "rb") as fi:
+            poller = self.client_.begin_analyze_document(
+                self.model,
+                analyze_request=fi,
+                content_type="application/octet-stream",
+                output_content_format="markdown",
+            )
+            result = poller.result()
+
+        return [Document(content=result.content, metadata=metadata)]
diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml
index 43678b4..b218b04 100644
--- a/libs/kotaemon/pyproject.toml
+++ b/libs/kotaemon/pyproject.toml
@@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"]
 # metadata and dependencies
 [project]
 name = "kotaemon"
-version = "0.3.11"
+version = "0.3.12"
 requires-python = ">= 3.10"
 description = "Kotaemon core library for AI development."
 dependencies = [
@@ -64,6 +64,7 @@ adv = [
     "pdfservices-sdk @ git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements",
     "fastembed",
     "beautifulsoup4",
+    "azure-ai-documentintelligence",
 ]
 dev = [
     "ipython",
diff --git a/libs/kotaemon/tests/test_reader.py b/libs/kotaemon/tests/test_reader.py
index 0aa2f2b..0cdfa51 100644
--- a/libs/kotaemon/tests/test_reader.py
+++ b/libs/kotaemon/tests/test_reader.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from unittest.mock import patch
 
 from langchain.schema import Document as LangchainDocument
 from llama_index.node_parser import SimpleNodeParser
@@ -6,6 +7,7 @@ from llama_index.node_parser import SimpleNodeParser
 from kotaemon.base import Document
 from kotaemon.loaders import (
     AutoReader,
+    AzureAIDocumentIntelligenceLoader,
     DocxReader,
     HtmlReader,
     MhtmlReader,
@@ -76,3 +78,15 @@ def test_mhtml_reader():
 
     assert len(docs) == 1
     assert docs[0].text.startswith("This is a test")
+
+
+@patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
+def test_azureai_document_intelligence_reader(mock_client):
+    reader = AzureAIDocumentIntelligenceLoader(
+        endpoint="https://endpoint.com",
+        credential="credential",
+    )
+    docs = reader(Path(__file__).parent / "resources" / "dummy.pdf")
+
+    assert len(docs) == 1
+    mock_client.assert_called_once()