Update splitters + metadata extractor interface to conform with new LlamaIndex design (#81)

* change splitter to a general doc parser class to fit the new llama-index design
* moving interface of splitter
This commit is contained in:
Tuan Anh Nguyen Dang (Tadashi_Cin)
2023-11-20 10:09:30 +07:00
committed by GitHub
parent 98c76c4700
commit 98509f886c
2 changed files with 31 additions and 39 deletions

View File

@@ -1,18 +1,17 @@
from typing import Any, List, Sequence, Type
from typing import Any, Sequence, Type
from llama_index.extractors import SummaryExtractor as LISummaryExtractor
from llama_index.extractors import TitleExtractor as LITitleExtractor
from llama_index.node_parser import (
SentenceWindowNodeParser as LISentenceWindowNodeParser,
)
from llama_index.node_parser import SimpleNodeParser as LISimpleNodeParser
from llama_index.node_parser.interface import NodeParser
from llama_index.text_splitter import TokenTextSplitter
from llama_index.text_splitter import TokenTextSplitter as LITokenTextSplitter
from ..base import BaseComponent, Document
__all__ = ["TokenTextSplitter"]
class LINodeParser(BaseComponent):
class LIDocParser(BaseComponent):
_parser_class: Type[NodeParser]
def __init__(self, *args, **kwargs):
@@ -32,39 +31,28 @@ class LINodeParser(BaseComponent):
def __getattr__(self, name: str) -> Any:
return getattr(self._parser, name)
def get_nodes_from_documents(
def run(
self,
documents: Sequence[Document],
show_progress: bool = False,
) -> List[Document]:
documents = self._parser.get_nodes_from_documents(
documents=documents, show_progress=show_progress
)
**kwargs,
) -> Sequence[Document]:
documents = self._parser(documents, **kwargs)
# convert Document to new base class from kotaemon
converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]
return converted_documents
def run(
self,
documents: Sequence[Document],
show_progress: bool = False,
) -> List[Document]:
return self.get_nodes_from_documents(
documents=documents, show_progress=show_progress
)
# Document parser that selects LlamaIndex's TokenTextSplitter (imported in
# this file as LITokenTextSplitter) as its underlying parser class.
# NOTE(review): how the base class uses `_parser_class` is not visible in this
# hunk; presumably it is instantiated in LIDocParser.__init__ -- confirm.
class TokenSplitter(LIDocParser):
_parser_class = LITokenTextSplitter
# Wrapper that selects LlamaIndex's SimpleNodeParser (imported as
# LISimpleNodeParser) and always supplies it with a TokenTextSplitter.
class SimpleNodeParser(LINodeParser):
_parser_class = LISimpleNodeParser
def __init__(self, *args, **kwargs):
# Pull the chunking options out of kwargs so they are not forwarded to the
# base class; defaults are 512 for chunk_size and 0 for chunk_overlap.
chunk_size = kwargs.pop("chunk_size", 512)
chunk_overlap = kwargs.pop("chunk_overlap", 0)
# Any caller-supplied "text_splitter" entry is overwritten here.
kwargs["text_splitter"] = TokenTextSplitter(
chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
# Remaining positional and keyword arguments go to the base initializer.
super().__init__(*args, **kwargs)
class SentenceWindowNodeParser(LINodeParser):
class SentenceWindowNodeParser(LIDocParser):
_parser_class = LISentenceWindowNodeParser
# Document parser that selects LlamaIndex's TitleExtractor (imported in this
# file as LITitleExtractor) as its underlying parser class.
# NOTE(review): the base class's use of `_parser_class` is not visible in this
# hunk -- confirm against LIDocParser.__init__.
class TitleExtractor(LIDocParser):
_parser_class = LITitleExtractor
# Document parser that selects LlamaIndex's SummaryExtractor (imported in this
# file as LISummaryExtractor) as its underlying parser class.
# NOTE(review): the base class's use of `_parser_class` is not visible in this
# hunk -- confirm against LIDocParser.__init__.
class SummaryExtractor(LIDocParser):
_parser_class = LISummaryExtractor