[AUR-395] Adopt Example1 disclaimer pipeline (#42)

* Adopt Example1 disclaimer pipeline
* Update Document class
* Add composite components
* Modify Extractor behaviours
This commit is contained in:
ian_Cin
2023-10-10 15:42:48 +07:00
committed by GitHub
parent 79cc60e6a2
commit 84f1fa8cbd
12 changed files with 654 additions and 37 deletions

View File

@@ -1,12 +1,43 @@
from typing import Any, Optional
from haystack.schema import Document as HaystackDocument
from llama_index.bridge.pydantic import Field
from llama_index.schema import Document as BaseDocument
from pyparsing import TypeVar
IO_Type = TypeVar("IO_Type", "Document", str)
SAMPLE_TEXT = "A sample Document from kotaemon"
class Document(BaseDocument):
"""Base document class, mostly inherited from Document class from llama-index"""
"""
Base document class, mostly inherited from Document class from llama-index.
This class accepts one positional argument `content` of an arbitrary type, which will
store the raw content of the document. If specified, the class will use
`content` to initialize the base llama_index class.
"""
content: Any
def __init__(self, content: Optional[Any] = None, *args, **kwargs):
    """Initialize the document from raw content or from keyword fields.

    Args:
        content: Raw content of the document. When it is ``None``, the
            content falls back to the ``text`` keyword argument and then
            to the ``embedding`` keyword argument, using the first one
            that is not ``None``. When it is another ``Document``, the
            fields returned by ``content.dict()`` are used as the base
            keyword arguments. Any other value is stored as-is and its
            ``str()`` form becomes ``text`` (an empty string when the
            value is falsy).
        *args: Forwarded unchanged to the base class initializer.
        **kwargs: Field values forwarded to the base class initializer.
            When ``content`` is a ``Document``, these take precedence
            over the fields copied from it.
    """
    if content is None:
        # No positional content: derive it from the first available field.
        if kwargs.get("text", None) is not None:
            kwargs["content"] = kwargs["text"]
        elif kwargs.get("embedding", None) is not None:
            kwargs["content"] = kwargs["embedding"]
    elif isinstance(content, Document):
        # Start from the source document's fields, but keep any keyword
        # arguments the caller passed explicitly instead of dropping them.
        kwargs = {**content.dict(), **kwargs}
    else:
        kwargs["content"] = content
        # Falsy content (None is handled above; e.g. "", 0, []) maps to
        # an empty text rather than its string representation.
        kwargs["text"] = str(content) if content else ""
    super().__init__(*args, **kwargs)
def __bool__(self):
    """Report the truthiness of the document as that of its ``content``."""
    raw = self.content
    return True if raw else False
@classmethod
def example(cls) -> "Document":
@@ -23,7 +54,7 @@ class Document(BaseDocument):
return HaystackDocument(content=text, meta=metadata)
def __str__(self):
    """Return the string form of the document's raw ``content``."""
    # A single return: stringify `content` (not `text`), so that str()
    # keys off the same attribute that `__bool__` inspects.
    return str(self.content)
class RetrievedDocument(Document):