Add docx + html reader (#139)
parent 116919b346
commit 65852b7d71
.github/workflows/unit-test.yaml (vendored): 1 change

@@ -88,6 +88,7 @@ jobs:
           steps.check-cache-hit.outputs.check != 'true'
         run: |
           python -m pip install --upgrade pip
+          cd libs/kotaemon
           pip install -U --upgrade-strategy eager -e .[dev]
 
       - name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}
README.md: 14 changes

@@ -26,13 +26,6 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
 
 ```shell
 git clone git@github.com:Cinnamon/kotaemon.git
-cd kotaemon
-```
-
-- Install all
-
-```shell
-pip install -e ".[dev]"
 ```
 
 - Pre-commit
@@ -41,6 +34,13 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
 pre-commit install
 ```
 
+- Install all
+
+```shell
+cd kotaemon/libs/kotaemon
+pip install -e ".[dev]"
+```
+
 - Test
 
 ```shell
libs/kotaemon/kotaemon/loaders/__init__.py: 4 changes

@@ -1,5 +1,7 @@
 from .base import AutoReader, DirectoryReader
+from .docx_loader import DocxReader
 from .excel_loader import PandasExcelReader
+from .html_loader import HtmlReader
 from .mathpix_loader import MathpixPDFReader
 from .ocr_loader import OCRReader
 from .unstructured_loader import UnstructuredReader
@@ -11,4 +13,6 @@ __all__ = [
     "OCRReader",
     "DirectoryReader",
     "UnstructuredReader",
+    "DocxReader",
+    "HtmlReader",
 ]
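With the two new exports in place, both readers are reachable from `kotaemon.loaders` like the existing loaders. A minimal sketch of the import path (the file names below are placeholders, and `python-docx`/`html2text` are assumed to be installed):

```python
from kotaemon.loaders import DocxReader, HtmlReader

# Same calling convention as the other loaders: construct, then load_data(path).
docx_docs = DocxReader().load_data("some_report.docx")  # placeholder path
html_docs = HtmlReader().load_data("some_page.html")    # placeholder path

for doc in docx_docs + html_docs:
    print(type(doc).__name__, doc.metadata)
```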
libs/kotaemon/kotaemon/loaders/docx_loader.py: 91 lines (new file)

@@ -0,0 +1,91 @@
import unicodedata
from pathlib import Path
from typing import List, Optional

import pandas as pd
from llama_index.readers.base import BaseReader

from kotaemon.base import Document


class DocxReader(BaseReader):
    """Read .docx files, preserving tables, using the python-docx library

    Reader behavior:
        - All paragraphs are extracted as a single Document
        - Each table is extracted as a Document, rendered as a CSV string
        - The output is a list of Documents, concatenating the above
          (tables + paragraphs)
    """

    def __init__(self, *args, **kwargs):
        try:
            import docx
        except ImportError:
            raise ImportError(
                "docx is not installed. "
                "Please install it using `pip install python-docx`"
            )
        self._module = docx

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Load data using the Docx reader

        Args:
            file_path (Path): Path to the .docx file

        Returns:
            List[Document]: list of documents extracted from the .docx file
        """
        file_path = Path(file_path).resolve()

        doc = self._module.Document(str(file_path))
        all_text = "\n".join(
            [unicodedata.normalize("NFKC", p.text) for p in doc.paragraphs]
        )
        pages = [all_text]  # 1 page only

        tables = []
        for t in doc.tables:
            arrays = [
                [
                    unicodedata.normalize("NFKC", t.cell(i, j).text)
                    for i in range(len(t.rows))
                ]
                for j in range(len(t.columns))
            ]
            tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))

        extra_info = extra_info or {}

        # create output Documents with metadata from the tables
        documents = [
            Document(
                text=table.to_csv(
                    index=False
                ).strip(),  # strip_special_chars_markdown()
                metadata={
                    "table_origin": table.to_csv(index=False),
                    "type": "table",
                    **extra_info,
                },
                metadata_template="",
                metadata_seperator="",
            )
            for table in tables  # page_id
        ]

        # create Document from non-table text
        documents.extend(
            [
                Document(
                    text=non_table_text.strip(),
                    metadata={"page_label": 1, **extra_info},
                )
                for _, non_table_text in enumerate(pages)
            ]
        )

        return documents
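A rough usage sketch of the output layout described in the docstring: table Documents first, tagged `type="table"` with the CSV kept in `metadata["table_origin"]`, followed by one Document holding all paragraph text. Here `example.docx` is a hypothetical file, and `python-docx` and `pandas` are assumed to be installed.

```python
from kotaemon.loaders import DocxReader

docs = DocxReader().load_data("example.docx", extra_info={"source": "example.docx"})

# Split the result the same way the reader builds it: tables vs. paragraph text.
tables = [d for d in docs if d.metadata.get("type") == "table"]
texts = [d for d in docs if d.metadata.get("type") != "table"]

for t in tables:
    print(t.metadata["table_origin"])  # original CSV rendering of the table
print(texts[0].text[:200])             # all paragraphs, page_label == 1
```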
libs/kotaemon/kotaemon/loaders/html_loader.py: 77 lines (new file)

@@ -0,0 +1,77 @@
import unicodedata
from pathlib import Path
from typing import List, Optional

from llama_index.readers.base import BaseReader

from kotaemon.base import Document


class HtmlReader(BaseReader):
    """Read HTML files using html2text

    Reader behavior:
        - HTML is read with html2text.
        - All of the text is split by `page_break_pattern`
        - Each page is extracted as a Document
        - The output is a list of Documents

    Args:
        page_break_pattern (str): Pattern to split the HTML into pages
    """

    def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):
        try:
            import html2text
        except ImportError:
            raise ImportError(
                "html2text is not installed. "
                "Please install it using `pip install html2text`"
            )

        self._module = html2text
        self._page_break_pattern: Optional[str] = page_break_pattern
        super().__init__()

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Load data using the Html reader

        Args:
            file_path (Path): Path to the HTML file

        Returns:
            List[Document]: list of documents extracted from the HTML file
        """
        file_path = Path(file_path).resolve()

        with file_path.open("r") as content:
            html_text = "".join(
                [
                    unicodedata.normalize("NFKC", line[:-1])
                    for line in content.readlines()
                ]
            )

        # convert HTML to plain text
        all_text = self._module.html2text(html_text)
        pages = (
            all_text.split(self._page_break_pattern)
            if self._page_break_pattern
            else [all_text]
        )

        extra_info = extra_info or {}

        # create a Document per page
        documents = [
            Document(
                text=page.strip(),
                metadata={"page_label": page_id + 1, **extra_info},
            )
            for page_id, page in enumerate(pages)
        ]

        return documents
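A short sketch of the `page_break_pattern` behaviour. Note that the split runs on the html2text output rather than the raw HTML, so the pattern has to survive conversion; `manual.html` and the `[[PAGE]]` marker are made-up examples, and `html2text` is assumed to be installed.

```python
from kotaemon.loaders import HtmlReader

# Without a pattern, the whole converted file becomes one Document (page_label=1).
whole = HtmlReader().load_data("manual.html")

# With a pattern, the converted text is split on that literal string and each
# piece becomes its own Document with an incrementing page_label.
paged = HtmlReader(page_break_pattern="[[PAGE]]").load_data("manual.html")

print(len(whole), len(paged))
print([d.metadata["page_label"] for d in paged])
```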
libs/kotaemon/pyproject.toml: 4 changes

@@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"]
 # metadata and dependencies
 [project]
 name = "kotaemon"
-version = "0.3.5"
+version = "0.3.6"
 requires-python = ">= 3.10"
 description = "Kotaemon core library for AI development."
 dependencies = [
@@ -55,6 +55,7 @@ dev = [
     "wikipedia",
     "duckduckgo-search",
     "googlesearch-python",
+    "python-docx",
     "python-dotenv",
     "pytest-mock",
     "unstructured[pdf]",
@@ -62,6 +63,7 @@ dev = [
     "cohere",
     "elasticsearch",
     "pypdf",
+    "html2text",
 ]
 
 [project.scripts]
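Both new packages live only in the dev extras, so the loaders guard their imports. A small illustration of what a missing extra looks like at runtime (purely a sketch; the error text comes from the loader itself):

```python
try:
    from kotaemon.loaders import DocxReader

    DocxReader()  # raises if python-docx is absent
except ImportError as err:
    # e.g. "docx is not installed. Please install it using `pip install python-docx`"
    print(err)
```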
libs/kotaemon/tests/resources/dummy.docx: new binary file (not shown)

libs/kotaemon/tests/resources/html/dummy.html: new file, 1 line (diff suppressed because one or more lines are too long)

libs/kotaemon/tests/resources/html/dummy_image.png: new binary file (not shown), 5.3 KiB
@@ -4,13 +4,29 @@ from langchain.schema import Document as LangchainDocument
 from llama_index.node_parser import SimpleNodeParser
 
 from kotaemon.base import Document
-from kotaemon.loaders import AutoReader, UnstructuredReader
+from kotaemon.loaders import AutoReader, DocxReader, HtmlReader, UnstructuredReader
+
+
+def test_docx_reader():
+    reader = DocxReader()
+    documents = reader.load_data(Path(__file__).parent / "resources" / "dummy.docx")
+
+    assert len(documents)
+
+
+def test_html_reader():
+    reader = HtmlReader()
+    documents = reader.load_data(
+        Path(__file__).parent / "resources" / "html" / "dummy.html"
+    )
+
+    assert len(documents)
 
 
 def test_pdf_reader():
     reader = AutoReader("PDFReader")
     dirpath = Path(__file__).parent
-    documents = reader.load_data(dirpath / "resources/dummy.pdf")
+    documents = reader.load_data(dirpath / "resources" / "dummy.pdf")
 
     # check document reader output
     assert len(documents) == 1
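The new tests are smoke tests (they only assert a non-empty result). A possible, hypothetical follow-up check, based on the fact that both loaders construct `kotaemon.base.Document` instances:

```python
from pathlib import Path

from kotaemon.base import Document
from kotaemon.loaders import DocxReader


def test_docx_reader_returns_documents():
    # Hypothetical extra assertion, not part of this commit.
    documents = DocxReader().load_data(
        Path(__file__).parent / "resources" / "dummy.docx"
    )

    assert documents
    assert all(isinstance(doc, Document) for doc in documents)
```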