[AUR-391, AUR-393] Add Document and DocumentReader base (#6)

* Declare BaseComponent

* Brainstorming base class for LLM call

* Define base LLM

* Add tests

* Clean telemetry environment for accurate testing

* Fix README

* Fix typing

* add base document reader

* update test

* update requirements

* Cosmetic change

* update requirements

* reformat

---------

Co-authored-by: trducng <trungduc1992@gmail.com>
This commit is contained in:
Tuan Anh Nguyen Dang (Tadashi_Cin)
2023-08-31 11:24:12 +07:00
committed by GitHub
parent 4211315a54
commit 21350153d4
6 changed files with 82 additions and 6 deletions

View File

@@ -1,10 +1,26 @@
# Stub class: in the lines shown it has only a docstring, no attributes or methods.
# NOTE(review): this view looks like a rendered diff with its +/- markers stripped, so
# this stub may belong to the pre-change side of the file — confirm against the repository.
class DocumentLoader:
"""Document loader"""
from pathlib import Path
from typing import Any, List, Type, Union
from llama_index import download_loader
from llama_index.readers.base import BaseReader
from ..documents.base import Document
# Stub class: in the lines shown it has only a docstring, no attributes or methods.
# NOTE(review): this view looks like a rendered diff with its +/- markers stripped, so
# this stub may belong to the pre-change side of the file — confirm against the repository.
class TextManipulator:
"""Text manipulation"""
# BaseReader subclass that keeps a delegate reader instance on self._reader.
class AutoReader(BaseReader):
"""General auto reader for a variety of files. (based on llama-hub)"""
# reader_type is either a string passed to download_loader() or a reader class;
# in both cases the resulting object is called with no arguments to build the delegate.
def __init__(self, reader_type: Union[str, Type[BaseReader]]) -> None:
"""Init reader using string identifier or class name from llama-hub"""
# NOTE(review): the DocumentManipulator stub below sits in the middle of __init__ in this
# view; it looks like leftover pre-change diff content — confirm against the repository.
class DocumentManipulator:
"""Document manipulation"""
if isinstance(reader_type, str):
# String identifier: call whatever download_loader returns, with no arguments.
self._reader = download_loader(reader_type)()
else:
# Not a string: treat reader_type as a class and instantiate it directly.
self._reader = reader_type()
def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:
    """Read `file` with the wrapped reader and return kotaemon Documents.

    Args:
        file: File to read; forwarded to the wrapped reader as the ``file``
            keyword argument.
        **kwargs: Extra keyword arguments passed unchanged to the wrapped
            reader's ``load_data``.

    Returns:
        A list with one ``Document`` per item produced by the wrapped reader,
        each rebuilt through ``Document.from_dict(item.to_dict())``.
    """
    loaded = self._reader.load_data(file=file, **kwargs)

    # Round-trip every item through its dict form so the result uses the
    # kotaemon Document base class rather than the reader's own class.
    converted: List[Document] = []
    for item in loaded:
        converted.append(Document.from_dict(item.to_dict()))
    return converted