Add docx + html reader (#139)
parent 116919b346
commit 65852b7d71
.github/workflows/unit-test.yaml (vendored): 1 change

@@ -88,6 +88,7 @@ jobs:
           steps.check-cache-hit.outputs.check != 'true'
         run: |
           python -m pip install --upgrade pip
+          cd libs/kotaemon
           pip install -U --upgrade-strategy eager -e .[dev]
 
       - name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}
README.md: 14 changes

@@ -26,13 +26,6 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
 
 ```shell
 git clone git@github.com:Cinnamon/kotaemon.git
-cd kotaemon
-```
-
-- Install all
-
-```shell
-pip install -e ".[dev]"
 ```
 
 - Pre-commit
@@ -41,6 +34,13 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
 pre-commit install
 ```
 
+- Install all
+
+```shell
+cd kotaemon/libs/kotaemon
+pip install -e ".[dev]"
+```
+
 - Test
 
 ```shell
libs/kotaemon/kotaemon/loaders/__init__.py: 4 changes

@@ -1,5 +1,7 @@
 from .base import AutoReader, DirectoryReader
+from .docx_loader import DocxReader
 from .excel_loader import PandasExcelReader
+from .html_loader import HtmlReader
 from .mathpix_loader import MathpixPDFReader
 from .ocr_loader import OCRReader
 from .unstructured_loader import UnstructuredReader
@@ -11,4 +13,6 @@ __all__ = [
     "OCRReader",
     "DirectoryReader",
     "UnstructuredReader",
+    "DocxReader",
+    "HtmlReader",
 ]
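With the two new exports in place, both readers are reachable from `kotaemon.loaders` like the existing loaders. A minimal sketch of the import path (the file names below are placeholders, and `python-docx`/`html2text` are assumed to be installed):

```python
from kotaemon.loaders import DocxReader, HtmlReader

# Same calling convention as the other loaders: construct, then load_data(path).
docx_docs = DocxReader().load_data("some_report.docx")  # placeholder path
html_docs = HtmlReader().load_data("some_page.html")    # placeholder path

for doc in docx_docs + html_docs:
    print(type(doc).__name__, doc.metadata)
```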
libs/kotaemon/kotaemon/loaders/docx_loader.py: 91 lines (new file)

@@ -0,0 +1,91 @@
import unicodedata
from pathlib import Path
from typing import List, Optional

import pandas as pd
from llama_index.readers.base import BaseReader

from kotaemon.base import Document


class DocxReader(BaseReader):
    """Read .docx files, preserving tables, using the python-docx library

    Reader behavior:
        - All paragraphs are extracted as a single Document
        - Each table is extracted as a Document, rendered as a CSV string
        - The output is a list of Documents, concatenating the above
          (tables + paragraphs)
    """

    def __init__(self, *args, **kwargs):
        try:
            import docx
        except ImportError:
            raise ImportError(
                "docx is not installed. "
                "Please install it using `pip install python-docx`"
            )
        self._module = docx

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Load data using the Docx reader

        Args:
            file_path (Path): Path to the .docx file

        Returns:
            List[Document]: list of documents extracted from the .docx file
        """
        file_path = Path(file_path).resolve()

        doc = self._module.Document(str(file_path))
        all_text = "\n".join(
            [unicodedata.normalize("NFKC", p.text) for p in doc.paragraphs]
        )
        pages = [all_text]  # 1 page only

        tables = []
        for t in doc.tables:
            arrays = [
                [
                    unicodedata.normalize("NFKC", t.cell(i, j).text)
                    for i in range(len(t.rows))
                ]
                for j in range(len(t.columns))
            ]
            tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))

        extra_info = extra_info or {}

        # create output Documents with metadata from the tables
        documents = [
            Document(
                text=table.to_csv(
                    index=False
                ).strip(),  # strip_special_chars_markdown()
                metadata={
                    "table_origin": table.to_csv(index=False),
                    "type": "table",
                    **extra_info,
                },
                metadata_template="",
                metadata_seperator="",
            )
            for table in tables  # page_id
        ]

        # create Document from non-table text
        documents.extend(
            [
                Document(
                    text=non_table_text.strip(),
                    metadata={"page_label": 1, **extra_info},
                )
                for _, non_table_text in enumerate(pages)
            ]
        )

        return documents
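A rough usage sketch of the output layout described in the docstring: table Documents first, tagged `type="table"` with the CSV kept in `metadata["table_origin"]`, followed by one Document holding all paragraph text. Here `example.docx` is a hypothetical file, and `python-docx` and `pandas` are assumed to be installed.

```python
from kotaemon.loaders import DocxReader

docs = DocxReader().load_data("example.docx", extra_info={"source": "example.docx"})

# Split the result the same way the reader builds it: tables vs. paragraph text.
tables = [d for d in docs if d.metadata.get("type") == "table"]
texts = [d for d in docs if d.metadata.get("type") != "table"]

for t in tables:
    print(t.metadata["table_origin"])  # original CSV rendering of the table
print(texts[0].text[:200])             # all paragraphs, page_label == 1
```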
libs/kotaemon/kotaemon/loaders/html_loader.py: 77 lines (new file)

@@ -0,0 +1,77 @@
import unicodedata
from pathlib import Path
from typing import List, Optional

from llama_index.readers.base import BaseReader

from kotaemon.base import Document


class HtmlReader(BaseReader):
    """Read HTML files using html2text

    Reader behavior:
        - HTML is read with html2text.
        - All of the text is split by `page_break_pattern`
        - Each page is extracted as a Document
        - The output is a list of Documents

    Args:
        page_break_pattern (str): Pattern to split the HTML into pages
    """

    def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):
        try:
            import html2text
        except ImportError:
            raise ImportError(
                "html2text is not installed. "
                "Please install it using `pip install html2text`"
            )

        self._module = html2text
        self._page_break_pattern: Optional[str] = page_break_pattern
        super().__init__()

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Load data using the Html reader

        Args:
            file_path (Path): Path to the HTML file

        Returns:
            List[Document]: list of documents extracted from the HTML file
        """
        file_path = Path(file_path).resolve()

        with file_path.open("r") as content:
            html_text = "".join(
                [
                    unicodedata.normalize("NFKC", line[:-1])
                    for line in content.readlines()
                ]
            )

        # convert HTML to plain text
        all_text = self._module.html2text(html_text)
        pages = (
            all_text.split(self._page_break_pattern)
            if self._page_break_pattern
            else [all_text]
        )

        extra_info = extra_info or {}

        # create a Document per page
        documents = [
            Document(
                text=page.strip(),
                metadata={"page_label": page_id + 1, **extra_info},
            )
            for page_id, page in enumerate(pages)
        ]

        return documents
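A short sketch of the `page_break_pattern` behaviour. Note that the split runs on the html2text output rather than the raw HTML, so the pattern has to survive conversion; `manual.html` and the `[[PAGE]]` marker are made-up examples, and `html2text` is assumed to be installed.

```python
from kotaemon.loaders import HtmlReader

# Without a pattern, the whole converted file becomes one Document (page_label=1).
whole = HtmlReader().load_data("manual.html")

# With a pattern, the converted text is split on that literal string and each
# piece becomes its own Document with an incrementing page_label.
paged = HtmlReader(page_break_pattern="[[PAGE]]").load_data("manual.html")

print(len(whole), len(paged))
print([d.metadata["page_label"] for d in paged])
```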
libs/kotaemon/pyproject.toml: 4 changes

@@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"]
 # metadata and dependencies
 [project]
 name = "kotaemon"
-version = "0.3.5"
+version = "0.3.6"
 requires-python = ">= 3.10"
 description = "Kotaemon core library for AI development."
 dependencies = [
@@ -55,6 +55,7 @@ dev = [
     "wikipedia",
     "duckduckgo-search",
     "googlesearch-python",
+    "python-docx",
     "python-dotenv",
     "pytest-mock",
     "unstructured[pdf]",
@@ -62,6 +63,7 @@ dev = [
     "cohere",
     "elasticsearch",
     "pypdf",
+    "html2text",
 ]
 
 [project.scripts]
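Both new packages live only in the dev extras, so the loaders guard their imports. A small illustration of what a missing extra looks like at runtime (purely a sketch; the error text comes from the loader itself):

```python
try:
    from kotaemon.loaders import DocxReader

    DocxReader()  # raises if python-docx is absent
except ImportError as err:
    # e.g. "docx is not installed. Please install it using `pip install python-docx`"
    print(err)
```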
libs/kotaemon/tests/resources/dummy.docx: new binary file (not shown)

libs/kotaemon/tests/resources/html/dummy.html: new file, 1 line (diff suppressed because one or more lines are too long)

libs/kotaemon/tests/resources/html/dummy_image.png: new binary file (not shown), 5.3 KiB
@@ -4,13 +4,29 @@ from langchain.schema import Document as LangchainDocument
 from llama_index.node_parser import SimpleNodeParser
 
 from kotaemon.base import Document
-from kotaemon.loaders import AutoReader, UnstructuredReader
+from kotaemon.loaders import AutoReader, DocxReader, HtmlReader, UnstructuredReader
+
+
+def test_docx_reader():
+    reader = DocxReader()
+    documents = reader.load_data(Path(__file__).parent / "resources" / "dummy.docx")
+
+    assert len(documents)
+
+
+def test_html_reader():
+    reader = HtmlReader()
+    documents = reader.load_data(
+        Path(__file__).parent / "resources" / "html" / "dummy.html"
+    )
+
+    assert len(documents)
 
 
 def test_pdf_reader():
     reader = AutoReader("PDFReader")
     dirpath = Path(__file__).parent
-    documents = reader.load_data(dirpath / "resources/dummy.pdf")
+    documents = reader.load_data(dirpath / "resources" / "dummy.pdf")
 
     # check document reader output
     assert len(documents) == 1
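The new tests are smoke tests (they only assert a non-empty result). A possible, hypothetical follow-up check, based on the fact that both loaders construct `kotaemon.base.Document` instances:

```python
from pathlib import Path

from kotaemon.base import Document
from kotaemon.loaders import DocxReader


def test_docx_reader_returns_documents():
    # Hypothetical extra assertion, not part of this commit.
    documents = DocxReader().load_data(
        Path(__file__).parent / "resources" / "dummy.docx"
    )

    assert documents
    assert all(isinstance(doc, Document) for doc in documents)
```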