Add UnstructuredReader with support for various legacy files (.doc, .xls) (#99)
parent 37c744b616
commit d9e925eb75
knowledgehub/loaders/__init__.py
@@ -2,6 +2,7 @@ from .base import AutoReader, DirectoryReader
 from .excel_loader import PandasExcelReader
 from .mathpix_loader import MathpixPDFReader
 from .ocr_loader import OCRReader
+from .unstructured_loader import UnstructuredReader
 
 __all__ = [
     "AutoReader",
@@ -9,4 +10,5 @@ __all__ = [
     "MathpixPDFReader",
     "OCRReader",
     "DirectoryReader",
+    "UnstructuredReader",
 ]
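With the export added above, the new reader is importable from the loaders package the same way the existing readers are (the `kotaemon.loaders` path follows the tests changed later in this commit); a minimal sketch:

from kotaemon.loaders import UnstructuredReader

reader = UnstructuredReader()  # no url/api kwargs, so files are parsed locally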
knowledgehub/loaders/unstructured_loader.py (new file, 110 lines)
@@ -0,0 +1,110 @@
"""Unstructured file reader.

A parser for unstructured text files using Unstructured.io.
Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents.

To parse .doc and .xls files, additionally install:

    sudo apt-get install -y libmagic-dev poppler-utils libreoffice
    pip install xlrd
"""
from pathlib import Path
from typing import Any, Dict, List, Optional

from llama_index.readers.base import BaseReader

from kotaemon.base import Document


class UnstructuredReader(BaseReader):
    """General unstructured text reader for a variety of files."""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Init params."""
        super().__init__(*args)  # kwargs are not passed to the parent, which cannot accept them

        self.api = False  # default to local parsing
        if "url" in kwargs:
            self.server_url = str(kwargs["url"])
            self.api = True  # a url was given, so switch to the API
        else:
            self.server_url = "http://localhost:8000"

        if "api" in kwargs:
            self.api = kwargs["api"]

        self.api_key = ""
        if "api_key" in kwargs:
            self.api_key = kwargs["api_key"]

    def load_data(
        self,
        file: Path,
        additional_metadata: Optional[Dict] = None,
        split_documents: Optional[bool] = False,
        **kwargs,
    ) -> List[Document]:
        """Load data using Unstructured.io.

        If the reader was constructed with a url (or api=True), the file is
        parsed through an API call; otherwise it is parsed locally. When
        split_documents is True, one Document is returned per parsed element,
        with the element's metadata attached; additional_metadata, if given,
        is merged into each document's metadata.

        Returns a list of documents.
        """
        file_path_str = str(file)
        if self.api:
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(
                filename=file_path_str,
                api_key=self.api_key,
                api_url=self.server_url + "/general/v0/general",
            )
        else:
            # parse the file locally
            from unstructured.partition.auto import partition

            elements = partition(filename=file_path_str)

        # process elements
        docs = []
        file_name = Path(file).name
        if split_documents:
            for node in elements:
                metadata = {"file_name": file_name}
                if hasattr(node, "metadata"):
                    # load metadata fields
                    for field, val in vars(node.metadata).items():
                        if field == "_known_field_names":
                            continue
                        # drop coordinates: they do not serialize
                        if field == "coordinates":
                            continue
                        # drop parent_id: it may cause interference downstream
                        if field == "parent_id":
                            continue
                        metadata[field] = val

                if additional_metadata is not None:
                    metadata.update(additional_metadata)

                metadata["file_name"] = file_name
                docs.append(Document(text=node.text, metadata=metadata))

        else:
            text_chunks = [" ".join(str(el).split()) for el in elements]
            metadata = {"file_name": file_name}

            if additional_metadata is not None:
                metadata.update(additional_metadata)

            # create a single document by joining all the texts
            docs.append(Document(text="\n\n".join(text_chunks), metadata=metadata))

        return docs
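A minimal usage sketch of the new reader in local mode, following the load_data docstring above; the legacy file path and the extra metadata key are hypothetical, and the .doc case assumes the libreoffice dependency noted in the module docstring is installed:

from pathlib import Path

from kotaemon.loaders import UnstructuredReader

reader = UnstructuredReader()  # no url given, so the file is parsed locally
docs = reader.load_data(
    Path("report.doc"),  # hypothetical legacy Word file
    additional_metadata={"source": "intranet"},  # hypothetical extra metadata, merged into each doc
    split_documents=True,  # one Document per parsed element
)
for doc in docs:
    print(doc.metadata["file_name"], len(doc.text))

With split_documents=False (the default), the same call returns a single Document whose text joins all parsed elements.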
@@ -4,7 +4,7 @@ from langchain.schema import Document as LangchainDocument
 from llama_index.node_parser import SimpleNodeParser
 
 from kotaemon.base import Document
-from kotaemon.loaders import AutoReader
+from kotaemon.loaders import AutoReader, UnstructuredReader
 
 
 def test_pdf_reader():
@@ -26,3 +26,22 @@ def test_pdf_reader():
     node_parser = SimpleNodeParser.from_defaults(chunk_size=100, chunk_overlap=20)
     nodes = node_parser.get_nodes_from_documents(documents)
     assert len(nodes) > 0
+
+
+def test_unstructured_pdf_reader():
+    reader = UnstructuredReader()
+    dirpath = Path(__file__).parent
+    input_path = dirpath / "resources/dummy.pdf"
+    documents = reader.load_data(input_path)
+
+    # check document reader output
+    assert len(documents) == 1
+
+    first_doc = documents[0]
+    assert isinstance(first_doc, Document)
+    assert first_doc.text.lower().replace(" ", "") == "dummypdffile"
+
+    # split documents mode
+    documents = reader.load_data(input_path, split_documents=True)
+    # check document reader output
+    assert len(documents) == 1
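The test above only exercises local parsing. For the API path set up in __init__, a hedged sketch; it assumes an Unstructured API server is reachable at the reader's default address:

from pathlib import Path

from kotaemon.loaders import UnstructuredReader

# passing url switches the reader into API mode; requests are sent to
# <server_url>/general/v0/general with the given api_key
reader = UnstructuredReader(url="http://localhost:8000", api_key="")
documents = reader.load_data(Path("resources/dummy.pdf"))
assert len(documents) == 1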