Add docx + html reader (#139)

2024-01-31 19:21:30 +07:00
parent 116919b346
commit 65852b7d71
10 changed files with 202 additions and 10 deletions
--- a/.github/workflows/unit-test.yaml
+++ b/.github/workflows/unit-test.yaml
@@ -88,6 +88,7 @@ jobs:
          steps.check-cache-hit.outputs.check != 'true'
        run: |
          python -m pip install --upgrade pip
+          cd libs/kotaemon
          pip install -U --upgrade-strategy eager -e .[dev]

      - name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}
--- a/README.md
+++ b/README.md
@@ -26,13 +26,6 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git

  ```shell
  git clone git@github.com:Cinnamon/kotaemon.git
-  cd kotaemon
-  ```
-
- Install all
-
-  ```shell
-  pip install -e ".[dev]"
  ```

 - Pre-commit
@@ -41,6 +34,13 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
  pre-commit install
  ```

+- Install all
+
+  ```shell
+  cd kotaemon/libs/kotaemon
+  pip install -e ".[dev]"
+  ```
+
 - Test

  ```shell
--- a/libs/kotaemon/kotaemon/loaders/init.py
+++ b/libs/kotaemon/kotaemon/loaders/init.py
@@ -1,5 +1,7 @@
 from .base import AutoReader, DirectoryReader
+from .docx_loader import DocxReader
 from .excel_loader import PandasExcelReader
+from .html_loader import HtmlReader
 from .mathpix_loader import MathpixPDFReader
 from .ocr_loader import OCRReader
 from .unstructured_loader import UnstructuredReader
@@ -11,4 +13,6 @@ __all__ = [
    "OCRReader",
    "DirectoryReader",
    "UnstructuredReader",
+    "DocxReader",
+    "HtmlReader",
 ]
--- a/libs/kotaemon/kotaemon/loaders/docx_loader.py
+++ b/libs/kotaemon/kotaemon/loaders/docx_loader.py
@@ -0,0 +1,91 @@
+import unicodedata
+from pathlib import Path
+from typing import List, Optional
+
+import pandas as pd
+from llama_index.readers.base import BaseReader
+
+from kotaemon.base import Document
+
+
+class DocxReader(BaseReader):
+    """Read Docx files while preserving tables, using the python-docx library
+
+    Reader behavior:
+        - All paragraphs are extracted as a Document
+        - Each table is extracted as a Document, rendered as a CSV string
+        - The output is a list of Documents, concatenating the above
+        (tables + paragraphs)
+    """
+
+    def __init__(self, *args, **kwargs):
+        try:
+            import docx
+        except ImportError:
+            raise ImportError(
+                "docx is not installed. "
+                "Please install it using `pip install python-docx`"
+            )
+        self._module = docx
+
+    def load_data(
+        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> List[Document]:
+        """Load data using Docx reader
+
+        Args:
+            file_path (Path): Path to Docx file
+
+        Returns:
+            List[Document]: list of documents extracted from the Docx file
+        """
+        file_path = Path(file_path).resolve()
+
+        doc = self._module.Document(str(file_path))
+        all_text = "\n".join(
+            [unicodedata.normalize("NFKC", p.text) for p in doc.paragraphs]
+        )
+        pages = [all_text]  # 1 page only
+
+        tables = []
+        for t in doc.tables:
+            arrays = [
+                [
+                    unicodedata.normalize("NFKC", t.cell(i, j).text)
+                    for i in range(len(t.rows))
+                ]
+                for j in range(len(t.columns))
+            ]
+            tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))
+
+        extra_info = extra_info or {}
+
+        # create output Document with metadata from table
+        documents = [
+            Document(
+                text=table.to_csv(
+                    index=False
+                ).strip(),  # strip_special_chars_markdown()
+                metadata={
+                    "table_origin": table.to_csv(index=False),
+                    "type": "table",
+                    **extra_info,
+                },
+                metadata_template="",
+                metadata_seperator="",
+            )
+            for table in tables  # page_id
+        ]
+
+        # create Document from non-table text
+        documents.extend(
+            [
+                Document(
+                    text=non_table_text.strip(),
+                    metadata={"page_label": 1, **extra_info},
+                )
+                for _, non_table_text in enumerate(pages)
+            ]
+        )
+
+        return documents
--- a/libs/kotaemon/kotaemon/loaders/html_loader.py
+++ b/libs/kotaemon/kotaemon/loaders/html_loader.py
@@ -0,0 +1,77 @@
+import unicodedata
+from pathlib import Path
+from typing import List, Optional
+
+from llama_index.readers.base import BaseReader
+
+from kotaemon.base import Document
+
+
+class HtmlReader(BaseReader):
+    """Read HTML using html2text
+
+    Reader behavior:
+        - HTML is read with html2text.
+        - All of the texts will be split by `page_break_pattern`
+        - Each page is extracted as a Document
+        - The output is a list of Documents
+
+    Args:
+        page_break_pattern (str): Pattern to split the HTML into pages
+    """
+
+    def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):
+        try:
+            import html2text
+        except ImportError:
+            raise ImportError(
+                "html2text is not installed. "
+                "Please install it using `pip install html2text`"
+            )
+
+        self._module = html2text
+        self._page_break_pattern: Optional[str] = page_break_pattern
+        super().__init__()
+
+    def load_data(
+        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> List[Document]:
+        """Load data using Html reader
+
+        Args:
+            file_path (Path): Path to HTML file
+            extra_info (dict): Extra metadata added to every output Document
+            **kwargs: Unused
+        Returns:
+            List[Document]: list of documents extracted from the HTML file
+        """
+        file_path = Path(file_path).resolve()
+
+        with file_path.open("r") as content:
+            html_text = "".join(
+                [
+                    unicodedata.normalize("NFKC", line[:-1])
+                    for line in content.readlines()
+                ]
+            )
+
+        # read HTML
+        all_text = self._module.html2text(html_text)
+        pages = (
+            all_text.split(self._page_break_pattern)
+            if self._page_break_pattern
+            else [all_text]
+        )
+
+        extra_info = extra_info or {}
+
+        # create one Document per page
+        documents = [
+            Document(
+                text=page.strip(),
+                metadata={"page_label": page_id + 1, **extra_info},
+            )
+            for page_id, page in enumerate(pages)
+        ]
+
+        return documents
--- a/libs/kotaemon/pyproject.toml
+++ b/libs/kotaemon/pyproject.toml
@@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"]
 # metadata and dependencies
 [project]
 name = "kotaemon"
-version = "0.3.5"
+version = "0.3.6"
 requires-python = ">= 3.10"
 description = "Kotaemon core library for AI development."
 dependencies = [
@@ -55,6 +55,7 @@ dev = [
    "wikipedia",
    "duckduckgo-search",
    "googlesearch-python",
+    "python-docx",
    "python-dotenv",
    "pytest-mock",
    "unstructured[pdf]",
@@ -62,6 +63,7 @@ dev = [
    "cohere",
    "elasticsearch",
    "pypdf",
+    "html2text",
 ]

 [project.scripts]
--- a/libs/kotaemon/tests/resources/dummy.docx
+++ b/libs/kotaemon/tests/resources/dummy.docx
--- a/libs/kotaemon/tests/resources/html/dummy.html
+++ b/libs/kotaemon/tests/resources/html/dummy.html
--- a/libs/kotaemon/tests/resources/html/dummy_image.png
+++ b/libs/kotaemon/tests/resources/html/dummy_image.png
--- a/libs/kotaemon/tests/test_reader.py
+++ b/libs/kotaemon/tests/test_reader.py
@@ -4,13 +4,29 @@ from langchain.schema import Document as LangchainDocument
 from llama_index.node_parser import SimpleNodeParser

 from kotaemon.base import Document
-from kotaemon.loaders import AutoReader, UnstructuredReader
+from kotaemon.loaders import AutoReader, DocxReader, HtmlReader, UnstructuredReader
+
+
+def test_docx_reader():
+    reader = DocxReader()
+    documents = reader.load_data(Path(__file__).parent / "resources" / "dummy.docx")
+
+    assert len(documents)
+
+
+def test_html_reader():
+    reader = HtmlReader()
+    documents = reader.load_data(
+        Path(__file__).parent / "resources" / "html" / "dummy.html"
+    )
+
+    assert len(documents)


 def test_pdf_reader():
    reader = AutoReader("PDFReader")
    dirpath = Path(__file__).parent
-    documents = reader.load_data(dirpath / "resources/dummy.pdf")
+    documents = reader.load_data(dirpath / "resources" / "dummy.pdf")

    # check document reader output
    assert len(documents) == 1