From d9e925eb7557c61f55e907c3e1eab431aca9d7dd Mon Sep 17 00:00:00 2001 From: "Tuan Anh Nguyen Dang (Tadashi_Cin)" Date: Tue, 5 Dec 2023 16:19:13 +0700 Subject: [PATCH] Add UnstructuredReader with support for various legacy files (.doc, .xls) (#99) --- knowledgehub/loaders/__init__.py | 2 + knowledgehub/loaders/unstructured_loader.py | 110 ++++++++++++++++++++ tests/test_reader.py | 21 +++- 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 knowledgehub/loaders/unstructured_loader.py diff --git a/knowledgehub/loaders/__init__.py b/knowledgehub/loaders/__init__.py index 281ea75..e564c5e 100644 --- a/knowledgehub/loaders/__init__.py +++ b/knowledgehub/loaders/__init__.py @@ -2,6 +2,7 @@ from .base import AutoReader, DirectoryReader from .excel_loader import PandasExcelReader from .mathpix_loader import MathpixPDFReader from .ocr_loader import OCRReader +from .unstructured_loader import UnstructuredReader __all__ = [ "AutoReader", @@ -9,4 +10,5 @@ __all__ = [ "MathpixPDFReader", "OCRReader", "DirectoryReader", + "UnstructuredReader", ] diff --git a/knowledgehub/loaders/unstructured_loader.py b/knowledgehub/loaders/unstructured_loader.py new file mode 100644 index 0000000..3972664 --- /dev/null +++ b/knowledgehub/loaders/unstructured_loader.py @@ -0,0 +1,110 @@ +"""Unstructured file reader. + +A parser for unstructured text files using Unstructured.io. +Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents. 
"""Unstructured file reader.

A parser for unstructured text files using Unstructured.io.
Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents.

To use the .doc and .xls parsers, install:

    sudo apt-get install -y libmagic-dev poppler-utils libreoffice
    pip install xlrd
"""
from pathlib import Path
from typing import Any, Dict, List, Optional

from llama_index.readers.base import BaseReader

from kotaemon.base import Document

# Element-metadata fields dropped when building Document metadata:
# - coordinates: does not serialize, and we don't want to bother with it
# - parent_id: might cause interference downstream
# - _known_field_names: internal bookkeeping of unstructured
_SKIPPED_METADATA_FIELDS = frozenset(
    {"coordinates", "parent_id", "_known_field_names"}
)


class UnstructuredReader(BaseReader):
    """General unstructured text reader for a variety of files.

    Keyword Args:
        url: base URL of an Unstructured API server. Setting it switches
            parsing from local mode to API mode.
        api: explicitly enable (True) or disable (False) API mode,
            overriding the default derived from ``url``.
        api_key: API key sent along with requests in API mode.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Init params."""
        # Not passing kwargs to the parent because BaseReader cannot accept them.
        super().__init__(*args)

        self.api = False  # we default to local parsing
        if "url" in kwargs:
            self.server_url = str(kwargs["url"])
            self.api = True  # if url was set, switch to API mode
        else:
            self.server_url = "http://localhost:8000"

        # An explicit "api" kwarg overrides the url-derived default.
        if "api" in kwargs:
            self.api = kwargs["api"]

        self.api_key: str = kwargs.get("api_key", "")

    def load_data(
        self,
        file: Path,
        additional_metadata: Optional[Dict] = None,
        split_documents: Optional[bool] = False,
        **kwargs,
    ) -> List[Document]:
        """Load data using Unstructured.io.

        Depending on the construction — if ``url`` was set or ``api=True`` —
        the file is parsed via an API call, else it is parsed locally.
        ``additional_metadata`` extends the returned metadata when
        ``split_documents`` is True.

        Args:
            file: path of the file to parse.
            additional_metadata: extra metadata merged into each Document's
                metadata dict.
            split_documents: if True, return one Document per parsed element;
                otherwise return a single Document joining all element texts.

        Returns:
            List of parsed Documents.
        """
        file_path_str = str(file)
        if self.api:
            # Parse through the hosted Unstructured API.
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(
                filename=file_path_str,
                api_key=self.api_key,
                api_url=self.server_url + "/general/v0/general",
            )
        else:
            # Parse the file locally.
            from unstructured.partition.auto import partition

            elements = partition(filename=file_path_str)

        return self._elements_to_documents(
            elements, Path(file).name, additional_metadata, split_documents
        )

    @staticmethod
    def _elements_to_documents(
        elements,
        file_name: str,
        additional_metadata: Optional[Dict],
        split_documents: Optional[bool],
    ) -> List[Document]:
        """Convert parsed unstructured elements into kotaemon Documents."""
        docs: List[Document] = []
        if split_documents:
            for node in elements:
                metadata: Dict[str, Any] = {"file_name": file_name}
                if hasattr(node, "metadata"):
                    # Load metadata fields, skipping the problematic ones.
                    for field, val in vars(node.metadata).items():
                        if field in _SKIPPED_METADATA_FIELDS:
                            continue
                        metadata[field] = val

                if additional_metadata is not None:
                    metadata.update(additional_metadata)

                # Re-assert the file name in case element or additional
                # metadata clobbered it above.
                metadata["file_name"] = file_name
                docs.append(Document(text=node.text, metadata=metadata))
        else:
            # Normalize whitespace within each element's text.
            text_chunks = [" ".join(str(el).split()) for el in elements]
            metadata = {"file_name": file_name}

            if additional_metadata is not None:
                metadata.update(additional_metadata)

            # Create a single document by joining all the texts.
            docs.append(
                Document(text="\n\n".join(text_chunks), metadata=metadata)
            )

        return docs


# --- tests/test_reader.py (addition from the same patch) -------------------
def test_unstructured_pdf_reader():
    """Smoke-test UnstructuredReader against the dummy PDF fixture."""
    reader = UnstructuredReader()
    dirpath = Path(__file__).parent
    input_path = dirpath / "resources/dummy.pdf"
    documents = reader.load_data(input_path)

    # check document reader output
    assert len(documents) == 1

    first_doc = documents[0]
    assert isinstance(first_doc, Document)
    assert first_doc.text.lower().replace(" ", "") == "dummypdffile"

    # split documents mode
    documents = reader.load_data(input_path, split_documents=True)
    # check document reader output
    assert len(documents) == 1