Add UnstructuredReader with support for various legacy files (.doc, .xls) (#99)

This commit is contained in:
Tuan Anh Nguyen Dang (Tadashi_Cin)
2023-12-05 16:19:13 +07:00
committed by GitHub
parent 37c744b616
commit d9e925eb75
3 changed files with 132 additions and 1 deletions

View File

@@ -4,7 +4,7 @@ from langchain.schema import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser
from kotaemon.base import Document
from kotaemon.loaders import AutoReader
from kotaemon.loaders import AutoReader, UnstructuredReader
def test_pdf_reader():
@@ -26,3 +26,22 @@ def test_pdf_reader():
node_parser = SimpleNodeParser.from_defaults(chunk_size=100, chunk_overlap=20)
nodes = node_parser.get_nodes_from_documents(documents)
assert len(nodes) > 0
def test_unstructured_pdf_reader():
    """Load the dummy PDF with UnstructuredReader in default and split modes."""
    unstructured_reader = UnstructuredReader()
    pdf_path = Path(__file__).parent / "resources/dummy.pdf"

    # Default mode: exactly one document is expected for the fixture file.
    loaded = unstructured_reader.load_data(pdf_path)
    assert len(loaded) == 1

    only_doc = loaded[0]
    assert isinstance(only_doc, Document)
    # Compare case-insensitively and without spaces.
    normalized_text = only_doc.text.lower().replace(" ", "")
    assert normalized_text == "dummypdffile"

    # Split mode: the fixture is still expected to produce a single document.
    split_docs = unstructured_reader.load_data(pdf_path, split_documents=True)
    assert len(split_docs) == 1