Add Azure AI Document Intelligence loader (#52)

* Add Azure AI Document Intelligence loader

* Add load_data interface to the Azure AI Document Intelligence loader

* Bump version

* Access azure credentials from environment variables
This commit is contained in:
Duc Nguyen (john) 2024-04-29 14:49:55 +07:00 committed by GitHub
parent bbe862fe47
commit ec11b54ff2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 86 additions and 1 deletions

View File

@ -1,4 +1,5 @@
from .adobe_loader import AdobeReader
from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader
from .base import AutoReader, BaseReader
from .composite_loader import DirectoryReader
from .docx_loader import DocxReader
@ -10,6 +11,7 @@ from .unstructured_loader import UnstructuredReader
__all__ = [
"AutoReader",
"AzureAIDocumentIntelligenceLoader",
"BaseReader",
"PandasExcelReader",
"MathpixPDFReader",

View File

@ -0,0 +1,68 @@
import os
from pathlib import Path
from typing import Optional
from kotaemon.base import Document, Param
from .base import BaseReader
class AzureAIDocumentIntelligenceLoader(BaseReader):
    """Parse a document with Azure AI Document Intelligence.

    The file is opened in binary mode, submitted to the configured analysis
    model with markdown requested as the output format, and the returned
    content is wrapped in a single ``Document``.

    As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp,
    tiff, heif, docx, xlsx, pptx and html.
    """

    # NOTE(review): presumably read by the base class / tooling to report the
    # optional packages this reader needs -- confirm against BaseReader.
    _dependencies = ["azure-ai-documentintelligence"]

    # The environment lookups below are evaluated once, when this class body is
    # executed; the default is None when the variable is not set at that time.
    endpoint: str = Param(
        os.environ.get("AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT", None),
        help="Endpoint of Azure AI Document Intelligence",
    )
    credential: str = Param(
        os.environ.get("AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL", None),
        help="Credential of Azure AI Document Intelligence",
    )
    model: str = Param(
        "prebuilt-layout",
        help=(
            "Model to use for document analysis. Default is prebuilt-layout. "
            "As of April 24, you can view the supported models [here]"
            "(https://learn.microsoft.com/en-us/azure/ai-services/"
            "document-intelligence/concept-model-overview?view=doc-intel-4.0.0"
            "#model-analysis-features)"
        ),
    )

    @Param.auto(depends_on=["endpoint", "credential"])
    def client_(self):
        """Create the Azure Document Intelligence client.

        The Azure SDK is imported inside this method rather than at module
        level, so the import is only attempted when a client is built.

        Returns:
            A ``DocumentIntelligenceClient`` constructed from ``self.endpoint``
            and an ``AzureKeyCredential`` wrapping ``self.credential``.

        Raises:
            ImportError: If the Azure SDK modules cannot be imported.
        """
        try:
            from azure.ai.documentintelligence import DocumentIntelligenceClient
            from azure.core.credentials import AzureKeyCredential
        except ImportError as err:
            # Chain the original error so the real import failure (missing
            # package vs. broken transitive dependency) stays in the traceback.
            raise ImportError(
                "Please install azure-ai-documentintelligence"
            ) from err

        return DocumentIntelligenceClient(
            self.endpoint, AzureKeyCredential(self.credential)
        )

    def run(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Parse ``file_path`` by delegating to ``load_data``.

        Args:
            file_path: Location of the document; converted to ``Path`` before
                being forwarded.
            extra_info: Optional metadata forwarded unchanged.
            **kwargs: Forwarded unchanged to ``load_data``.

        Returns:
            The list of documents produced by ``load_data``.
        """
        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Analyze one file and return its content as a single document.

        Args:
            file_path: Path of the file to analyze.
            extra_info: Optional metadata attached to the resulting document;
                an empty dict is used when it is ``None`` or empty.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A one-element list holding a ``Document`` whose content is the
            analysis result's ``content`` and whose metadata is ``extra_info``.
        """
        metadata = extra_info or {}

        with open(file_path, "rb") as fi:
            poller = self.client_.begin_analyze_document(
                self.model,
                analyze_request=fi,
                content_type="application/octet-stream",
                output_content_format="markdown",
            )
            # Wait for the long-running analysis while the file is still open.
            result = poller.result()

        return [Document(content=result.content, metadata=metadata)]

View File

@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"]
# metadata and dependencies
[project]
name = "kotaemon"
version = "0.3.11"
version = "0.3.12"
requires-python = ">= 3.10"
description = "Kotaemon core library for AI development."
dependencies = [
@ -64,6 +64,7 @@ adv = [
"pdfservices-sdk @ git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements",
"fastembed",
"beautifulsoup4",
"azure-ai-documentintelligence",
]
dev = [
"ipython",

View File

@ -1,4 +1,5 @@
from pathlib import Path
from unittest.mock import patch
from langchain.schema import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser
@ -6,6 +7,7 @@ from llama_index.node_parser import SimpleNodeParser
from kotaemon.base import Document
from kotaemon.loaders import (
AutoReader,
AzureAIDocumentIntelligenceLoader,
DocxReader,
HtmlReader,
MhtmlReader,
@ -76,3 +78,15 @@ def test_mhtml_reader():
assert len(docs) == 1
assert docs[0].text.startswith("This is a test")
@patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
def test_azureai_document_intelligence_reader(mock_client):
    """The loader returns one document and builds the Azure client once.

    ``DocumentIntelligenceClient`` is patched, so no request reaches Azure.
    """
    sample_pdf = Path(__file__).parent / "resources" / "dummy.pdf"
    loader = AzureAIDocumentIntelligenceLoader(
        credential="credential",
        endpoint="https://endpoint.com",
    )

    parsed = loader(sample_pdf)

    assert len(parsed) == 1
    mock_client.assert_called_once()