diff --git a/knowledgehub/documents/base.py b/knowledgehub/documents/base.py
new file mode 100644
index 0000000..d9c1981
--- /dev/null
+++ b/knowledgehub/documents/base.py
@@ -0,0 +1,22 @@
+from haystack.schema import Document as HaystackDocument
+from llama_index.schema import Document as BaseDocument
+
+SAMPLE_TEXT = "A sample Document from kotaemon"
+
+
+class Document(BaseDocument):
+    """Base document class, mostly inherited from Document class from llama-index"""
+
+    @classmethod
+    def example(cls) -> "Document":
+        """Return a sample document built from SAMPLE_TEXT."""
+        return cls(
+            text=SAMPLE_TEXT,
+            metadata={"filename": "README.md", "category": "codebase"},
+        )
+
+    def to_haystack_format(self) -> HaystackDocument:
+        """Convert this document to the Haystack document format."""
+        # Substitute an empty dict when metadata is unset or empty.
+        metadata = self.metadata or {}
+        return HaystackDocument(content=self.text, meta=metadata)
diff --git a/knowledgehub/loaders/__init__.py b/knowledgehub/loaders/__init__.py
index e69de29..1e3e992 100644
--- a/knowledgehub/loaders/__init__.py
+++ b/knowledgehub/loaders/__init__.py
@@ -0,0 +1,3 @@
+from .base import AutoReader
+
+__all__ = ["AutoReader"]
diff --git a/knowledgehub/loaders/base.py b/knowledgehub/loaders/base.py
index f21e2ec..804e765 100644
--- a/knowledgehub/loaders/base.py
+++ b/knowledgehub/loaders/base.py
@@ -1,10 +1,26 @@
-class DocumentLoader:
-    """Document loader"""
+from pathlib import Path
+from typing import Any, List, Type, Union
+
+from llama_index import download_loader
+from llama_index.readers.base import BaseReader
+
+from ..documents.base import Document
 
 
-class TextManipulator:
-    """Text manipulation"""
+class AutoReader(BaseReader):
+    """General auto reader for a variety of files (based on llama-hub)."""
 
+    def __init__(self, reader_type: Union[str, Type[BaseReader]]) -> None:
+        """Init reader using string identifier or class name from llama-hub"""
 
-class DocumentManipulator:
-    """Document manipulation"""
+        if isinstance(reader_type, str):
+            self._reader = download_loader(reader_type)()
+        else:
+            self._reader = reader_type()
+
+    def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:
+        """Load ``file`` with the wrapped reader and return kotaemon Documents."""
+        documents = self._reader.load_data(file=file, **kwargs)
+
+        # convert Document to new base class from kotaemon
+        return [Document.from_dict(doc.to_dict()) for doc in documents]
diff --git a/setup.py b/setup.py
index 94839d0..ef37e8e 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,9 @@ setuptools.setup(
         "farm-haystack==1.19.0",
         "langchain",
         "theflow",
+        "llama-index",
+        "llama-hub",
+        "nltk",
     ],
     extras_require={
         "dev": [
diff --git a/tests/resources/dummy.pdf b/tests/resources/dummy.pdf
new file mode 100644
index 0000000..774c2ea
Binary files /dev/null and b/tests/resources/dummy.pdf differ
diff --git a/tests/test_reader.py b/tests/test_reader.py
new file mode 100644
index 0000000..d31ddb7
--- /dev/null
+++ b/tests/test_reader.py
@@ -0,0 +1,32 @@
+from pathlib import Path
+
+from langchain.schema import Document as LangchainDocument
+from llama_index.node_parser import SimpleNodeParser
+
+from kotaemon.documents.base import Document, HaystackDocument
+from kotaemon.loaders import AutoReader
+
+
+def test_pdf_reader():
+    reader = AutoReader("PDFReader")
+    dirpath = Path(__file__).parent
+    documents = reader.load_data(dirpath / "resources/dummy.pdf")
+
+    # check document reader output
+    assert len(documents) == 1
+
+    first_doc = documents[0]
+    assert isinstance(first_doc, Document)
+    assert first_doc.text.lower().replace(" ", "") == "dummypdffile"
+
+    # check conversion output
+    haystack_doc = first_doc.to_haystack_format()
+    assert isinstance(haystack_doc, HaystackDocument)
+
+    langchain_doc = first_doc.to_langchain_format()
+    assert isinstance(langchain_doc, LangchainDocument)
+
+    # test chunking using NodeParser from llama-index
+    node_parser = SimpleNodeParser.from_defaults(chunk_size=100, chunk_overlap=20)
+    nodes = node_parser.get_nodes_from_documents(documents)
+    assert len(nodes) > 0