Add docx + html reader (#139)
This commit is contained in:
parent
116919b346
commit
65852b7d71
1
.github/workflows/unit-test.yaml
vendored
1
.github/workflows/unit-test.yaml
vendored
|
@ -88,6 +88,7 @@ jobs:
|
|||
steps.check-cache-hit.outputs.check != 'true'
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
cd libs/kotaemon
|
||||
pip install -U --upgrade-strategy eager -e .[dev]
|
||||
|
||||
- name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}
|
||||
|
|
14
README.md
14
README.md
|
@ -26,13 +26,6 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
|
|||
|
||||
```shell
|
||||
git clone git@github.com:Cinnamon/kotaemon.git
|
||||
cd kotaemon
|
||||
```
|
||||
|
||||
- Install all
|
||||
|
||||
```shell
|
||||
pip install -e ".[dev]"
|
||||
```
|
||||
|
||||
- Pre-commit
|
||||
|
@ -41,6 +34,13 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
|
|||
pre-commit install
|
||||
```
|
||||
|
||||
- Install all
|
||||
|
||||
```shell
|
||||
cd kotaemon/libs/kotaemon
|
||||
pip install -e ".[dev]"
|
||||
```
|
||||
|
||||
- Test
|
||||
|
||||
```shell
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
from .base import AutoReader, DirectoryReader
|
||||
from .docx_loader import DocxReader
|
||||
from .excel_loader import PandasExcelReader
|
||||
from .html_loader import HtmlReader
|
||||
from .mathpix_loader import MathpixPDFReader
|
||||
from .ocr_loader import OCRReader
|
||||
from .unstructured_loader import UnstructuredReader
|
||||
|
@ -11,4 +13,6 @@ __all__ = [
|
|||
"OCRReader",
|
||||
"DirectoryReader",
|
||||
"UnstructuredReader",
|
||||
"DocxReader",
|
||||
"HtmlReader",
|
||||
]
|
||||
|
|
91
libs/kotaemon/kotaemon/loaders/docx_loader.py
Normal file
91
libs/kotaemon/kotaemon/loaders/docx_loader.py
Normal file
|
@ -0,0 +1,91 @@
|
|||
import unicodedata
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
import pandas as pd
|
||||
from llama_index.readers.base import BaseReader
|
||||
|
||||
from kotaemon.base import Document
|
||||
|
||||
|
||||
class DocxReader(BaseReader):
    """Read Docx files that respect table, using python-docx library

    Reader behavior:
        - All paragraphs are joined and extracted as a single Document
        - Each table is extracted as a Document, rendered as a CSV string
        - The output is a list of Documents, concatenating the above
        (tables first, then the paragraph text)
    """

    def __init__(self, *args, **kwargs):
        """Import python-docx lazily so it is only required when this reader is used.

        Raises:
            ImportError: if the `docx` module (python-docx) cannot be imported
        """
        try:
            import docx
        except ImportError as exc:
            raise ImportError(
                "docx is not installed. "
                "Please install it using `pip install python-docx`"
            ) from exc

        self._module = docx
        super().__init__()

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Load data using Docx reader

        Args:
            file_path (Path): Path to the Docx file
            extra_info (Optional[dict]): extra metadata merged into the metadata
                of every returned Document

        Returns:
            List[Document]: one Document per table (CSV text, with
                `type="table"` metadata), followed by one Document holding
                all paragraph text (with `page_label=1` metadata)
        """
        file_path = Path(file_path).resolve()
        doc = self._module.Document(str(file_path))
        extra_info = extra_info or {}

        documents = []
        for table in doc.tables:
            n_rows, n_cols = len(table.rows), len(table.columns)
            if n_rows == 0 or n_cols == 0:
                # nothing to render; also avoids indexing a missing header row
                continue

            rows = [
                [
                    unicodedata.normalize("NFKC", table.cell(i, j).text)
                    for j in range(n_cols)
                ]
                for i in range(n_rows)
            ]
            header, *body = rows
            # Build from rows with an explicit column list rather than a
            # {header: column} dict: a dict collapses columns whose header
            # text is identical and would silently drop their data.
            frame = pd.DataFrame(body, columns=header)
            csv_text = frame.to_csv(index=False)

            documents.append(
                Document(
                    text=csv_text.strip(),
                    metadata={
                        "table_origin": csv_text,
                        "type": "table",
                        **extra_info,
                    },
                    metadata_template="",
                    metadata_seperator="",
                )
            )

        # all paragraph text goes into a single Document (treated as 1 page)
        all_text = "\n".join(
            unicodedata.normalize("NFKC", p.text) for p in doc.paragraphs
        )
        documents.append(
            Document(
                text=all_text.strip(),
                metadata={"page_label": 1, **extra_info},
            )
        )

        return documents
|
77
libs/kotaemon/kotaemon/loaders/html_loader.py
Normal file
77
libs/kotaemon/kotaemon/loaders/html_loader.py
Normal file
|
@ -0,0 +1,77 @@
|
|||
import unicodedata
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
from llama_index.readers.base import BaseReader
|
||||
|
||||
from kotaemon.base import Document
|
||||
|
||||
|
||||
class HtmlReader(BaseReader):
    """Read HTML files using html2text

    Reader behavior:
        - HTML is converted to text with html2text.
        - The text is split on `page_break_pattern` (when one is given)
        - Each page is extracted as a Document
        - The output is a list of Documents

    Args:
        page_break_pattern (str): literal substring used to split the converted
            text into pages (applied with `str.split`, not as a regular
            expression). If empty or None, the whole text is a single page.
    """

    def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):
        try:
            import html2text
        except ImportError as exc:
            raise ImportError(
                "html2text is not installed. "
                "Please install it using `pip install html2text`"
            ) from exc

        self._module = html2text
        self._page_break_pattern: Optional[str] = page_break_pattern
        super().__init__()

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Load data using Html reader

        Args:
            file_path (Path): Path to the HTML file
            extra_info (Optional[dict]): extra metadata merged into the metadata
                of every returned Document

        Returns:
            List[Document]: list of documents extracted from the HTML file, one
                per page, with a 1-based `page_label` in the metadata
        """
        file_path = Path(file_path).resolve()

        # Read the file in one go and keep its line breaks. Dropping the last
        # character of every line and joining with "" would truncate a final
        # line that has no trailing newline and glue adjacent lines together.
        # The file is opened with the platform default text encoding.
        with file_path.open("r") as content:
            html_text = unicodedata.normalize("NFKC", content.read())

        # convert HTML to text
        all_text = self._module.html2text(html_text)
        if self._page_break_pattern:
            pages = all_text.split(self._page_break_pattern)
        else:
            pages = [all_text]

        extra_info = extra_info or {}

        # create one Document per page
        return [
            Document(
                text=page.strip(),
                metadata={"page_label": page_id + 1, **extra_info},
            )
            for page_id, page in enumerate(pages)
        ]
|
|
@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"]
|
|||
# metadata and dependencies
|
||||
[project]
|
||||
name = "kotaemon"
|
||||
version = "0.3.5"
|
||||
version = "0.3.6"
|
||||
requires-python = ">= 3.10"
|
||||
description = "Kotaemon core library for AI development."
|
||||
dependencies = [
|
||||
|
@ -55,6 +55,7 @@ dev = [
|
|||
"wikipedia",
|
||||
"duckduckgo-search",
|
||||
"googlesearch-python",
|
||||
"python-docx",
|
||||
"python-dotenv",
|
||||
"pytest-mock",
|
||||
"unstructured[pdf]",
|
||||
|
@ -62,6 +63,7 @@ dev = [
|
|||
"cohere",
|
||||
"elasticsearch",
|
||||
"pypdf",
|
||||
"html2text",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
|
BIN
libs/kotaemon/tests/resources/dummy.docx
Normal file
BIN
libs/kotaemon/tests/resources/dummy.docx
Normal file
Binary file not shown.
1
libs/kotaemon/tests/resources/html/dummy.html
Normal file
1
libs/kotaemon/tests/resources/html/dummy.html
Normal file
File diff suppressed because one or more lines are too long
BIN
libs/kotaemon/tests/resources/html/dummy_image.png
Normal file
BIN
libs/kotaemon/tests/resources/html/dummy_image.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 5.3 KiB |
|
@ -4,13 +4,29 @@ from langchain.schema import Document as LangchainDocument
|
|||
from llama_index.node_parser import SimpleNodeParser
|
||||
|
||||
from kotaemon.base import Document
|
||||
from kotaemon.loaders import AutoReader, UnstructuredReader
|
||||
from kotaemon.loaders import AutoReader, DocxReader, HtmlReader, UnstructuredReader
|
||||
|
||||
|
||||
def test_docx_reader():
    """Smoke test: DocxReader returns at least one document for dummy.docx."""
    sample_path = Path(__file__).parent / "resources" / "dummy.docx"
    docs = DocxReader().load_data(sample_path)

    assert len(docs) > 0
|
||||
|
||||
|
||||
def test_html_reader():
    """Smoke test: HtmlReader returns at least one document for dummy.html."""
    sample_path = Path(__file__).parent / "resources" / "html" / "dummy.html"
    docs = HtmlReader().load_data(sample_path)

    assert len(docs) > 0
|
||||
|
||||
|
||||
def test_pdf_reader():
|
||||
reader = AutoReader("PDFReader")
|
||||
dirpath = Path(__file__).parent
|
||||
documents = reader.load_data(dirpath / "resources/dummy.pdf")
|
||||
documents = reader.load_data(dirpath / "resources" / "dummy.pdf")
|
||||
|
||||
# check document reader output
|
||||
assert len(documents) == 1
|
||||
|
|
Loading…
Reference in New Issue
Block a user