Update splitters + metadata extractor interface to conform with new LlamaIndex design (#81)

* Change the splitters to a general doc-parser class to fit the new LlamaIndex design
* Move the splitter interface accordingly
This commit is contained in:
parent 98c76c4700
commit 98509f886c
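Note on the core interface change: in the newer LlamaIndex design, node parsers and metadata extractors are both plain transformations that can be called directly on a sequence of nodes, so the wrapper's entry point is renamed from `get_nodes_from_documents` to the generic `run`, which delegates via `self._parser(documents, **kwargs)`. A before/after sketch (the `parser` and `docs` names are illustrative, not from the commit):

    # before: splitter-specific entry point
    nodes = parser.get_nodes_from_documents(documents=docs, show_progress=False)

    # after: any doc parser (splitter or metadata extractor) is called uniformly
    nodes = parser(docs)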
@@ -1,18 +1,17 @@
-from typing import Any, List, Sequence, Type
+from typing import Any, Sequence, Type

+from llama_index.extractors import SummaryExtractor as LISummaryExtractor
+from llama_index.extractors import TitleExtractor as LITitleExtractor
 from llama_index.node_parser import (
     SentenceWindowNodeParser as LISentenceWindowNodeParser,
 )
-from llama_index.node_parser import SimpleNodeParser as LISimpleNodeParser
 from llama_index.node_parser.interface import NodeParser
-from llama_index.text_splitter import TokenTextSplitter
+from llama_index.text_splitter import TokenTextSplitter as LITokenTextSplitter

 from ..base import BaseComponent, Document

-__all__ = ["TokenTextSplitter"]
-

-class LINodeParser(BaseComponent):
+class LIDocParser(BaseComponent):
     _parser_class: Type[NodeParser]

     def __init__(self, *args, **kwargs):
@@ -32,39 +31,28 @@ class LINodeParser(BaseComponent):
     def __getattr__(self, name: str) -> Any:
         return getattr(self._parser, name)

-    def get_nodes_from_documents(
+    def run(
         self,
         documents: Sequence[Document],
-        show_progress: bool = False,
-    ) -> List[Document]:
-        documents = self._parser.get_nodes_from_documents(
-            documents=documents, show_progress=show_progress
-        )
+        **kwargs,
+    ) -> Sequence[Document]:
+        documents = self._parser(documents, **kwargs)
         # convert Document to new base class from kotaemon
         converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]
         return converted_documents

-    def run(
-        self,
-        documents: Sequence[Document],
-        show_progress: bool = False,
-    ) -> List[Document]:
-        return self.get_nodes_from_documents(
-            documents=documents, show_progress=show_progress
-        )
-

-class SimpleNodeParser(LINodeParser):
-    _parser_class = LISimpleNodeParser
-
-    def __init__(self, *args, **kwargs):
-        chunk_size = kwargs.pop("chunk_size", 512)
-        chunk_overlap = kwargs.pop("chunk_overlap", 0)
-        kwargs["text_splitter"] = TokenTextSplitter(
-            chunk_size=chunk_size, chunk_overlap=chunk_overlap
-        )
-        super().__init__(*args, **kwargs)
+class TokenSplitter(LIDocParser):
+    _parser_class = LITokenTextSplitter


-class SentenceWindowNodeParser(LINodeParser):
+class SentenceWindowNodeParser(LIDocParser):
     _parser_class = LISentenceWindowNodeParser
+
+
+class TitleExtractor(LIDocParser):
+    _parser_class = LITitleExtractor
+
+
+class SummaryExtractor(LIDocParser):
+    _parser_class = LISummaryExtractor
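For context, a minimal usage sketch of the new wrapper classes (assuming `Document` objects from `kotaemon.base`, and that constructor keyword arguments are forwarded to the wrapped LlamaIndex class, as the indexing pipeline below does with `chunk_size`/`chunk_overlap`):

    from kotaemon.base import Document
    from kotaemon.indexing.doc_parsers import TitleExtractor, TokenSplitter

    docs = [Document(text="some long document text ...")]

    # split into token-bounded chunks
    splitter = TokenSplitter(chunk_size=512, chunk_overlap=0)
    nodes = splitter(docs)

    # enrich node metadata with extracted titles
    # (the underlying LlamaIndex extractor uses its configured LLM)
    nodes = TitleExtractor()(nodes)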
@@ -2,14 +2,14 @@ import os
 from pathlib import Path
 from typing import Dict, List, Optional, Sequence, Union

-from llama_index.node_parser.extractors import MetadataExtractor
 from llama_index.readers.base import BaseReader
 from theflow import Node
 from theflow.utils.modules import ObjectInitDeclaration as _

 from kotaemon.base import BaseComponent
 from kotaemon.embeddings import AzureOpenAIEmbeddings
-from kotaemon.indexing.splitters import SimpleNodeParser
+from kotaemon.indexing.doc_parsers import LIDocParser as DocParser
+from kotaemon.indexing.doc_parsers import TokenSplitter
 from kotaemon.loaders import (
     AutoReader,
     DirectoryReader,
@@ -45,7 +45,7 @@ class ReaderIndexingPipeline(BaseComponent):
     chunk_overlap: int = 256
     vector_store: _[BaseVectorStore] = _(InMemoryVectorStore)
     doc_store: _[BaseDocumentStore] = _(InMemoryDocumentStore)
-    metadata_extractor: Optional[MetadataExtractor] = None
+    doc_parsers: List[DocParser] = []

     embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
         model="text-embedding-ada-002",
@@ -81,11 +81,10 @@ class ReaderIndexingPipeline(BaseComponent):
     )

     @Node.auto(depends_on=["chunk_size", "chunk_overlap"])
-    def text_splitter(self) -> SimpleNodeParser:
-        return SimpleNodeParser(
+    def text_splitter(self) -> TokenSplitter:
+        return TokenSplitter(
             chunk_size=self.chunk_size,
             chunk_overlap=self.chunk_overlap,
-            metadata_extractor=self.metadata_extractor,
         )

     def run(
@@ -111,6 +110,11 @@ class ReaderIndexingPipeline(BaseComponent):
         nodes = self.text_splitter(documents)
         self.log_progress(".num_docs", num_docs=len(nodes))

+        # document parsers call
+        if self.doc_parsers:
+            for parser in self.doc_parsers:
+                nodes = parser(nodes)
+
         self.indexing_vector_pipeline(nodes)
         # persist right after indexing
         self.indexing_vector_pipeline.save(file_storage_path)
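With the dedicated `metadata_extractor` field gone, extractors are now supplied as ordinary doc parsers and applied in order after splitting. A hypothetical configuration sketch (illustrative values; assumes the theflow-style keyword initialization used elsewhere in this class):

    from kotaemon.indexing.doc_parsers import SummaryExtractor, TitleExtractor

    indexing_pipeline = ReaderIndexingPipeline(
        chunk_size=1024,
        chunk_overlap=256,
        doc_parsers=[TitleExtractor(), SummaryExtractor()],
    )
    # run() now splits the documents, applies each doc parser in order,
    # then indexes and persists the resulting nodes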