75 lines
2.4 KiB
Python
75 lines
2.4 KiB
Python
from typing import TYPE_CHECKING, Any, Optional, TypeVar
|
|
|
|
from llama_index.bridge.pydantic import Field
|
|
from llama_index.schema import Document as BaseDocument
|
|
|
|
if TYPE_CHECKING:
|
|
from haystack.schema import Document as HaystackDocument
|
|
|
|
IO_Type = TypeVar("IO_Type", "Document", str)
|
|
SAMPLE_TEXT = "A sample Document from kotaemon"
|
|
|
|
|
|
class Document(BaseDocument):
|
|
"""
|
|
Base document class, mostly inherited from Document class from llama-index.
|
|
|
|
This class accept one positional argument `content` of an arbitrary type, which will
|
|
store the raw content of the document. If specified, the class will use
|
|
`content` to initialize the base llama_index class.
|
|
"""
|
|
|
|
content: Any
|
|
|
|
def __init__(self, content: Optional[Any] = None, *args, **kwargs):
|
|
if content is None:
|
|
if kwargs.get("text", None) is not None:
|
|
kwargs["content"] = kwargs["text"]
|
|
elif kwargs.get("embedding", None) is not None:
|
|
kwargs["content"] = kwargs["embedding"]
|
|
elif isinstance(content, Document):
|
|
kwargs = content.dict()
|
|
else:
|
|
kwargs["content"] = content
|
|
if content:
|
|
kwargs["text"] = str(content)
|
|
else:
|
|
kwargs["text"] = ""
|
|
super().__init__(*args, **kwargs)
|
|
|
|
def __bool__(self):
|
|
return bool(self.content)
|
|
|
|
@classmethod
|
|
def example(cls) -> "Document":
|
|
document = Document(
|
|
text=SAMPLE_TEXT,
|
|
metadata={"filename": "README.md", "category": "codebase"},
|
|
)
|
|
return document
|
|
|
|
def to_haystack_format(self) -> "HaystackDocument":
|
|
"""Convert struct to Haystack document format."""
|
|
from haystack.schema import Document as HaystackDocument
|
|
|
|
metadata = self.metadata or {}
|
|
text = self.text
|
|
return HaystackDocument(content=text, meta=metadata)
|
|
|
|
def __str__(self):
|
|
return str(self.content)
|
|
|
|
|
|
class RetrievedDocument(Document):
|
|
"""Subclass of Document with retrieval-related information
|
|
|
|
Attributes:
|
|
score (float): score of the document (from 0.0 to 1.0)
|
|
retrieval_metadata (dict): metadata from the retrieval process, can be used
|
|
by different components in a retrieved pipeline to communicate with each
|
|
other
|
|
"""
|
|
|
|
score: float = Field(default=0.0)
|
|
retrieval_metadata: dict = Field(default={})
|