Add docx + html reader (#139)

This commit is contained in:
Duc Nguyen (john) 2024-01-31 19:21:30 +07:00 committed by GitHub
parent 116919b346
commit 65852b7d71
10 changed files with 202 additions and 10 deletions

View File

@ -88,6 +88,7 @@ jobs:
steps.check-cache-hit.outputs.check != 'true'
run: |
python -m pip install --upgrade pip
cd libs/kotaemon
pip install -U --upgrade-strategy eager -e .[dev]
- name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}

View File

@ -26,13 +26,6 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
```shell
git clone git@github.com:Cinnamon/kotaemon.git
cd kotaemon
```
- Install all
```shell
pip install -e ".[dev]"
```
- Pre-commit
@ -41,6 +34,13 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
pre-commit install
```
- Install all
```shell
cd kotaemon/libs/kotaemon
pip install -e ".[dev]"
```
- Test
```shell

View File

@ -1,5 +1,7 @@
from .base import AutoReader, DirectoryReader
from .docx_loader import DocxReader
from .excel_loader import PandasExcelReader
from .html_loader import HtmlReader
from .mathpix_loader import MathpixPDFReader
from .ocr_loader import OCRReader
from .unstructured_loader import UnstructuredReader
@ -11,4 +13,6 @@ __all__ = [
"OCRReader",
"DirectoryReader",
"UnstructuredReader",
"DocxReader",
"HtmlReader",
]

View File

@ -0,0 +1,91 @@
import unicodedata
from pathlib import Path
from typing import List, Optional
import pandas as pd
from llama_index.readers.base import BaseReader
from kotaemon.base import Document
class DocxReader(BaseReader):
    """Read Docx files while preserving tables, using the python-docx library

    Reader behavior:
        - All paragraphs are joined and extracted as a single Document
        - Each table is extracted as a Document, rendered as a CSV string
        - The output is a list of Documents, concatenating the above
        (tables first, then the paragraph text)
    """

    def __init__(self, *args, **kwargs):
        # Import lazily so that python-docx stays an optional dependency.
        try:
            import docx
        except ImportError as exc:
            raise ImportError(
                "docx is not installed. "
                "Please install it using `pip install python-docx`"
            ) from exc

        self._module = docx
        super().__init__()

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Load data using Docx reader

        Args:
            file_path (Path): Path to the Docx file
            extra_info (Optional[dict]): Extra metadata merged into the
                metadata of every returned Document
            **kwargs: Accepted for interface compatibility; not used here

        Returns:
            List[Document]: One Document per table (CSV text), followed by
                one Document holding the text of all paragraphs
        """
        file_path = Path(file_path).resolve()
        doc = self._module.Document(str(file_path))
        extra_info = extra_info or {}

        all_text = "\n".join(
            unicodedata.normalize("NFKC", paragraph.text)
            for paragraph in doc.paragraphs
        )

        documents: List[Document] = []
        for table in doc.tables:
            n_rows = len(table.rows)
            n_cols = len(table.columns)
            if n_rows == 0 or n_cols == 0:
                # Nothing to render; indexing the header row would fail.
                continue

            grid = [
                [
                    unicodedata.normalize("NFKC", table.cell(i, j).text)
                    for j in range(n_cols)
                ]
                for i in range(n_rows)
            ]
            header, *body = grid
            # Build the frame from rows with an explicit column list: keying a
            # dict by header text would silently drop every column whose
            # header duplicates an earlier one (e.g. several blank headers).
            frame = pd.DataFrame(body, columns=header)
            csv_text = frame.to_csv(index=False)

            documents.append(
                Document(
                    text=csv_text.strip(),
                    metadata={
                        "table_origin": csv_text,
                        "type": "table",
                        **extra_info,
                    },
                    metadata_template="",
                    metadata_seperator="",
                )
            )

        # The whole paragraph text is treated as a single page.
        documents.append(
            Document(
                text=all_text.strip(),
                metadata={"page_label": 1, **extra_info},
            )
        )

        return documents

View File

@ -0,0 +1,77 @@
import unicodedata
from pathlib import Path
from typing import List, Optional
from llama_index.readers.base import BaseReader
from kotaemon.base import Document
class HtmlReader(BaseReader):
    """Read HTML files using html2text

    Reader behavior:
        - HTML is converted to text with html2text.
        - The text is split by `page_break_pattern` when one is given
        - Each page is extracted as a Document
        - The output is a list of Documents

    Args:
        page_break_pattern (str): Pattern to split the HTML into pages
    """

    def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):
        # Import lazily so that html2text stays an optional dependency.
        try:
            import html2text
        except ImportError as exc:
            raise ImportError(
                "html2text is not installed. "
                "Please install it using `pip install html2text`"
            ) from exc

        self._module = html2text
        self._page_break_pattern: Optional[str] = page_break_pattern
        super().__init__()

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Load data using Html reader

        Args:
            file_path (Path): Path to the HTML file
            extra_info (Optional[dict]): Extra metadata merged into the
                metadata of every returned Document
            **kwargs: Accepted for interface compatibility; not used here

        Returns:
            List[Document]: list of documents extracted from the HTML file,
                one per page, with a 1-based `page_label` in the metadata
        """
        file_path = Path(file_path).resolve()

        # Read the file in one go with an explicit encoding. Slicing off the
        # last character of each line would eat a real character when the file
        # has no trailing newline, and joining lines with "" would glue
        # together words that were only separated by a line break.
        raw_html = file_path.read_text(encoding="utf-8")
        html_text = unicodedata.normalize("NFKC", raw_html)

        # convert HTML to plain text
        all_text = self._module.html2text(html_text)
        if self._page_break_pattern:
            pages = all_text.split(self._page_break_pattern)
        else:
            pages = [all_text]

        extra_info = extra_info or {}

        # create one Document per page
        return [
            Document(
                text=page.strip(),
                metadata={"page_label": page_number, **extra_info},
            )
            for page_number, page in enumerate(pages, start=1)
        ]

View File

@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"]
# metadata and dependencies
[project]
name = "kotaemon"
version = "0.3.5"
version = "0.3.6"
requires-python = ">= 3.10"
description = "Kotaemon core library for AI development."
dependencies = [
@ -55,6 +55,7 @@ dev = [
"wikipedia",
"duckduckgo-search",
"googlesearch-python",
"python-docx",
"python-dotenv",
"pytest-mock",
"unstructured[pdf]",
@ -62,6 +63,7 @@ dev = [
"cohere",
"elasticsearch",
"pypdf",
"html2text",
]
[project.scripts]

Binary file not shown.

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.3 KiB

View File

@ -4,13 +4,29 @@ from langchain.schema import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser
from kotaemon.base import Document
from kotaemon.loaders import AutoReader, UnstructuredReader
from kotaemon.loaders import AutoReader, DocxReader, HtmlReader, UnstructuredReader
def test_docx_reader():
    """DocxReader yields at least one Document for the sample docx file."""
    sample_path = Path(__file__).parent / "resources" / "dummy.docx"
    documents = DocxReader().load_data(sample_path)
    assert len(documents)
def test_html_reader():
    """HtmlReader yields at least one Document for the sample HTML file."""
    sample_path = Path(__file__).parent / "resources" / "html" / "dummy.html"
    documents = HtmlReader().load_data(sample_path)
    assert len(documents)
def test_pdf_reader():
reader = AutoReader("PDFReader")
dirpath = Path(__file__).parent
documents = reader.load_data(dirpath / "resources/dummy.pdf")
documents = reader.load_data(dirpath / "resources" / "dummy.pdf")
# check document reader output
assert len(documents) == 1