diff --git a/knowledgehub/documents/base.py b/knowledgehub/documents/base.py
new file mode 100644
index 0000000..d9c1981
--- /dev/null
+++ b/knowledgehub/documents/base.py
@@ -0,0 +1,22 @@
+from haystack.schema import Document as HaystackDocument
+from llama_index.schema import Document as BaseDocument
+
+SAMPLE_TEXT = "A sample Document from kotaemon"
+
+
+class Document(BaseDocument):
+    """Base document class; inherits llama-index's Document and adds converters."""
+
+    @classmethod
+    def example(cls) -> "Document":
+        """Return a sample instance of `cls`, so subclasses get their own type."""
+        return cls(
+            text=SAMPLE_TEXT,
+            metadata={"filename": "README.md", "category": "codebase"},
+        )
+
+    def to_haystack_format(self) -> HaystackDocument:
+        """Convert to a Haystack document: text -> content, metadata -> meta."""
+        # an empty/None metadata is normalised to a fresh empty dict
+        metadata = self.metadata or {}
+        return HaystackDocument(content=self.text, meta=metadata)
diff --git a/knowledgehub/loaders/__init__.py b/knowledgehub/loaders/__init__.py
index e69de29..1e3e992 100644
--- a/knowledgehub/loaders/__init__.py
+++ b/knowledgehub/loaders/__init__.py
@@ -0,0 +1,3 @@
+from .base import AutoReader
+
+__all__ = ["AutoReader"]
diff --git a/knowledgehub/loaders/base.py b/knowledgehub/loaders/base.py
index f21e2ec..804e765 100644
--- a/knowledgehub/loaders/base.py
+++ b/knowledgehub/loaders/base.py
@@ -1,10 +1,26 @@
-class DocumentLoader:
-    """Document loader"""
+from pathlib import Path
+from typing import Any, List, Type, Union
+
+from llama_index import download_loader
+from llama_index.readers.base import BaseReader
+
+from ..documents.base import Document
 
 
-class TextManipulator:
-    """Text manipulation"""
+class AutoReader(BaseReader):
+    """Reader wrapping a llama-hub loader; returns kotaemon Document objects."""
 
+    def __init__(self, reader_type: Union[str, Type[BaseReader]]) -> None:
+        """Create the wrapped reader from a loader name (str) or a reader class."""
 
-class DocumentManipulator:
-    """Document manipulation"""
+        if isinstance(reader_type, str):
+            self._reader = download_loader(reader_type)()
+        else:
+            self._reader = reader_type()
+
+    def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:
+        """Read `file` with the wrapped reader, forwarding any extra kwargs."""
+        documents = self._reader.load_data(file=file, **kwargs)
+        # round-trip via dict to rebuild each item as the kotaemon Document class
+        converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]
+        return converted_documents
diff --git a/setup.py b/setup.py
index 94839d0..ef37e8e 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,9 @@ setuptools.setup(
         "farm-haystack==1.19.0",
         "langchain",
         "theflow",
+        "llama-index",
+        "llama-hub",
+        "nltk",
     ],
     extras_require={
         "dev": [
diff --git a/tests/resources/dummy.pdf b/tests/resources/dummy.pdf
new file mode 100644
index 0000000..774c2ea
Binary files /dev/null and b/tests/resources/dummy.pdf differ
diff --git a/tests/test_reader.py b/tests/test_reader.py
new file mode 100644
index 0000000..d31ddb7
--- /dev/null
+++ b/tests/test_reader.py
@@ -0,0 +1,32 @@
+from pathlib import Path
+
+from langchain.schema import Document as LangchainDocument
+from llama_index.node_parser import SimpleNodeParser
+
+from kotaemon.documents.base import Document, HaystackDocument
+from kotaemon.loaders import AutoReader
+
+
+def test_pdf_reader():
+    """Load the bundled dummy PDF and check its text, conversions and chunking."""
+    pdf_path = Path(__file__).parent / "resources/dummy.pdf"
+    loaded = AutoReader("PDFReader").load_data(pdf_path)
+
+    # the reader must yield exactly one kotaemon Document
+    assert len(loaded) == 1
+    doc = loaded[0]
+    assert isinstance(doc, Document)
+
+    # compare the extracted text ignoring case and spaces
+    normalized = doc.text.lower().replace(" ", "")
+    assert normalized == "dummypdffile"
+
+    # conversions to the other frameworks' document types
+    haystack_doc = doc.to_haystack_format()
+    assert isinstance(haystack_doc, HaystackDocument)
+    assert isinstance(doc.to_langchain_format(), LangchainDocument)
+
+    # chunking with llama-index's NodeParser must produce at least one node
+    splitter = SimpleNodeParser.from_defaults(chunk_size=100, chunk_overlap=20)
+    chunks = splitter.get_nodes_from_documents(loaded)
+    assert len(chunks) > 0