diff --git a/libs/kotaemon/kotaemon/indices/ingests/files.py b/libs/kotaemon/kotaemon/indices/ingests/files.py index 0f80cb2..e5e325e 100644 --- a/libs/kotaemon/kotaemon/indices/ingests/files.py +++ b/libs/kotaemon/kotaemon/indices/ingests/files.py @@ -9,7 +9,9 @@ from kotaemon.indices.splitters import BaseSplitter, TokenSplitter from kotaemon.loaders import ( AdobeReader, DirectoryReader, + HtmlReader, MathpixPDFReader, + MhtmlReader, OCRReader, PandasExcelReader, UnstructuredReader, @@ -20,6 +22,13 @@ KH_DEFAULT_FILE_EXTRACTORS: dict[str, Type[BaseReader]] = { ".docx": UnstructuredReader, ".xls": UnstructuredReader, ".doc": UnstructuredReader, + ".html": HtmlReader, + ".mhtml": MhtmlReader, + ".png": UnstructuredReader, + ".jpeg": UnstructuredReader, + ".jpg": UnstructuredReader, + ".tiff": UnstructuredReader, + ".tif": UnstructuredReader, } diff --git a/libs/kotaemon/kotaemon/loaders/__init__.py b/libs/kotaemon/kotaemon/loaders/__init__.py index a59d713..f0e7d0f 100644 --- a/libs/kotaemon/kotaemon/loaders/__init__.py +++ b/libs/kotaemon/kotaemon/loaders/__init__.py @@ -3,7 +3,7 @@ from .base import AutoReader, BaseReader from .composite_loader import DirectoryReader from .docx_loader import DocxReader from .excel_loader import PandasExcelReader -from .html_loader import HtmlReader +from .html_loader import HtmlReader, MhtmlReader from .mathpix_loader import MathpixPDFReader from .ocr_loader import ImageReader, OCRReader from .unstructured_loader import UnstructuredReader @@ -19,5 +19,6 @@ __all__ = [ "UnstructuredReader", "DocxReader", "HtmlReader", + "MhtmlReader", "AdobeReader", ] diff --git a/libs/kotaemon/kotaemon/loaders/html_loader.py b/libs/kotaemon/kotaemon/loaders/html_loader.py index 1295cfc..c939c8a 100644 --- a/libs/kotaemon/kotaemon/loaders/html_loader.py +++ b/libs/kotaemon/kotaemon/loaders/html_loader.py @@ -1,5 +1,6 @@ +import email from pathlib import Path -from typing import List, Optional +from typing import Optional from 
class MhtmlReader(BaseReader):
    """Parse `MHTML` (web archive) files with `BeautifulSoup`.

    An MHTML file is a MIME message. Every `text/html` part found in it is
    converted to plain text and the results are joined into one document.
    """

    def __init__(
        self,
        open_encoding: Optional[str] = None,
        bs_kwargs: Optional[dict] = None,
        get_text_separator: str = "",
    ) -> None:
        """Initialize the reader.

        Args:
            open_encoding: The encoding to use when opening the file. When
                `None`, the file is parsed from raw bytes so the result does
                not depend on the platform's default text encoding.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
                Defaults to `{"features": "lxml"}`.
            get_text_separator: The separator to use when getting the text
                from the soup.

        Raises:
            ImportError: If `beautifulsoup4` is not installed.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from exc

        self.open_encoding = open_encoding
        if bs_kwargs is None:
            bs_kwargs = {"features": "lxml"}
        self.bs_kwargs = bs_kwargs
        self.get_text_separator = get_text_separator

    @staticmethod
    def _decode_html_part(part) -> str:
        """Return the transfer-decoded payload of a MIME part as text."""
        payload = part.get_payload(decode=True)
        if payload is None:
            return ""

        # Honour the charset declared on the part instead of assuming UTF-8;
        # undecodable bytes are replaced rather than aborting the whole load.
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="replace")
        except LookupError:
            # The declared charset is not known to Python.
            return payload.decode("utf-8", errors="replace")

    def load_data(
        self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Load an MHTML file into a single document.

        Args:
            file_path: Path to the MHTML file to load.
            extra_info: Extra metadata merged into the document metadata.
                Its keys take precedence over the generated `source` and
                `title` entries.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A one-element list holding a document whose text is the
            concatenation of the text of every `text/html` part, with blank
            lines dropped. When several HTML parts exist, the title of the
            last one is kept in the metadata.
        """
        from bs4 import BeautifulSoup

        extra_info = extra_info or {}
        # Populate the generated keys up front so the metadata has the same
        # shape even when the archive contains no HTML part.
        metadata: dict = {"source": str(file_path), "title": "", **extra_info}

        # Parse first and release the file handle before any HTML processing.
        if self.open_encoding is None:
            with open(file_path, "rb") as f:
                message = email.message_from_binary_file(f)
        else:
            with open(file_path, "r", encoding=self.open_encoding) as f:
                message = email.message_from_string(f.read())

        pages = []
        # walk() yields the message itself and every nested sub-part, so HTML
        # inside nested multipart containers is found as well.
        for part in message.walk():
            if part.get_content_type() != "text/html":
                continue

            soup = BeautifulSoup(self._decode_html_part(part), **self.bs_kwargs)
            text = soup.get_text(self.get_text_separator)

            # `.string` is None for an empty <title> or one with nested
            # markup; avoid storing the literal string "None".
            title_tag = soup.title
            if title_tag is not None and title_tag.string is not None:
                title = str(title_tag.string)
            else:
                title = ""

            metadata = {
                "source": str(file_path),
                "title": title,
                **extra_info,
            }

            lines = [line for line in text.split("\n") if line.strip()]
            text = "\n\n".join(lines)
            if text:
                pages.append(text)

        return [Document(text="\n\n".join(pages), metadata=metadata)]
+ + + + + + + + + + + + + + + +This is a test.
+ +This is bold,= + italic, and underlined.= +
+ +asdakl fskljf +sklf jkslaf; djks dlkfa sk +sdjkl ksjkl jsjk skdjjks i +w ie sjkfksd fjisdf jks fjs +kdj fsk dfjskd +fjskd fjsd kfjsk f jskdf jskd +fjsk dfjskdf jsifj sifj sk +fjks fjksd fjskdf kjs jdfksk +fdjs fksj fks dfjs dfks +fdjsk fjskdfjskdf <= +span +class=3DSpellE>sjkf skjf sjkdf +skfjsfjk s
+ +The end.
+ +