From 98509f886ca205e8af27701beca0447405d180e6 Mon Sep 17 00:00:00 2001
From: "Tuan Anh Nguyen Dang (Tadashi_Cin)"
Date: Mon, 20 Nov 2023 10:09:30 +0700
Subject: [PATCH] Update splitters + metadata extractor interface to conform
 with new LlamaIndex design (#81)

* change splitter to a general doc-parser class to fit the new llama-index design

* move the splitter interface
---
 .../indexing/{splitters.py => doc_parsers.py} | 54 ++++++++-----------
 knowledgehub/pipelines/ingest.py              | 16 +++---
 2 files changed, 31 insertions(+), 39 deletions(-)
 rename knowledgehub/indexing/{splitters.py => doc_parsers.py} (50%)

diff --git a/knowledgehub/indexing/splitters.py b/knowledgehub/indexing/doc_parsers.py
similarity index 50%
rename from knowledgehub/indexing/splitters.py
rename to knowledgehub/indexing/doc_parsers.py
index ac05d5c..83e2cd5 100644
--- a/knowledgehub/indexing/splitters.py
+++ b/knowledgehub/indexing/doc_parsers.py
@@ -1,18 +1,17 @@
-from typing import Any, List, Sequence, Type
+from typing import Any, Sequence, Type
 
+from llama_index.extractors import SummaryExtractor as LISummaryExtractor
+from llama_index.extractors import TitleExtractor as LITitleExtractor
 from llama_index.node_parser import (
     SentenceWindowNodeParser as LISentenceWindowNodeParser,
 )
-from llama_index.node_parser import SimpleNodeParser as LISimpleNodeParser
 from llama_index.node_parser.interface import NodeParser
-from llama_index.text_splitter import TokenTextSplitter
+from llama_index.text_splitter import TokenTextSplitter as LITokenTextSplitter
 
 from ..base import BaseComponent, Document
 
-__all__ = ["TokenTextSplitter"]
-
 
-class LINodeParser(BaseComponent):
+class LIDocParser(BaseComponent):
     _parser_class: Type[NodeParser]
 
     def __init__(self, *args, **kwargs):
@@ -32,39 +31,28 @@ class LINodeParser(BaseComponent):
     def __getattr__(self, name: str) -> Any:
         return getattr(self._parser, name)
 
-    def get_nodes_from_documents(
+    def run(
         self,
         documents: Sequence[Document],
-        show_progress: bool = False,
-    ) -> List[Document]:
-        documents = self._parser.get_nodes_from_documents(
-            documents=documents, show_progress=show_progress
-        )
+        **kwargs,
+    ) -> Sequence[Document]:
+        documents = self._parser(documents, **kwargs)
         # convert Document to new base class from kotaemon
         converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]
         return converted_documents
 
-    def run(
-        self,
-        documents: Sequence[Document],
-        show_progress: bool = False,
-    ) -> List[Document]:
-        return self.get_nodes_from_documents(
-            documents=documents, show_progress=show_progress
-        )
+
+class TokenSplitter(LIDocParser):
+    _parser_class = LITokenTextSplitter
 
 
-class SimpleNodeParser(LINodeParser):
-    _parser_class = LISimpleNodeParser
-
-    def __init__(self, *args, **kwargs):
-        chunk_size = kwargs.pop("chunk_size", 512)
-        chunk_overlap = kwargs.pop("chunk_overlap", 0)
-        kwargs["text_splitter"] = TokenTextSplitter(
-            chunk_size=chunk_size, chunk_overlap=chunk_overlap
-        )
-        super().__init__(*args, **kwargs)
-
-
-class SentenceWindowNodeParser(LINodeParser):
+class SentenceWindowNodeParser(LIDocParser):
     _parser_class = LISentenceWindowNodeParser
+
+
+class TitleExtractor(LIDocParser):
+    _parser_class = LITitleExtractor
+
+
+class SummaryExtractor(LIDocParser):
+    _parser_class = LISummaryExtractor
diff --git a/knowledgehub/pipelines/ingest.py b/knowledgehub/pipelines/ingest.py
index 78b7ee4..af56853 100644
--- a/knowledgehub/pipelines/ingest.py
+++ b/knowledgehub/pipelines/ingest.py
@@ -2,14 +2,14 @@ import os
 from pathlib import Path
 from typing import Dict, List, Optional, Sequence, Union
 
-from llama_index.node_parser.extractors import MetadataExtractor
 from llama_index.readers.base import BaseReader
 from theflow import Node
 from theflow.utils.modules import ObjectInitDeclaration as _
 
 from kotaemon.base import BaseComponent
 from kotaemon.embeddings import AzureOpenAIEmbeddings
-from kotaemon.indexing.splitters import SimpleNodeParser
+from kotaemon.indexing.doc_parsers import LIDocParser as DocParser
+from kotaemon.indexing.doc_parsers import TokenSplitter
 from kotaemon.loaders import (
     AutoReader,
     DirectoryReader,
@@ -45,7 +45,7 @@ class ReaderIndexingPipeline(BaseComponent):
     chunk_overlap: int = 256
     vector_store: _[BaseVectorStore] = _(InMemoryVectorStore)
     doc_store: _[BaseDocumentStore] = _(InMemoryDocumentStore)
-    metadata_extractor: Optional[MetadataExtractor] = None
+    doc_parsers: List[DocParser] = []
 
     embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
         model="text-embedding-ada-002",
@@ -81,11 +81,10 @@ class ReaderIndexingPipeline(BaseComponent):
     )
 
     @Node.auto(depends_on=["chunk_size", "chunk_overlap"])
-    def text_splitter(self) -> SimpleNodeParser:
-        return SimpleNodeParser(
+    def text_splitter(self) -> TokenSplitter:
+        return TokenSplitter(
             chunk_size=self.chunk_size,
             chunk_overlap=self.chunk_overlap,
-            metadata_extractor=self.metadata_extractor,
         )
 
     def run(
@@ -111,6 +110,11 @@ class ReaderIndexingPipeline(BaseComponent):
         nodes = self.text_splitter(documents)
         self.log_progress(".num_docs", num_docs=len(nodes))
 
+        # run any additional document parsers (e.g. metadata extractors)
+        if self.doc_parsers:
+            for parser in self.doc_parsers:
+                nodes = parser(nodes)
+
         self.indexing_vector_pipeline(nodes)
         # persist right after indexing
         self.indexing_vector_pipeline.save(file_storage_path)
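
A minimal usage sketch of the refactored interface, based on the classes the
patch introduces above. The input `documents` (a pre-loaded Sequence[Document])
and the parser settings are illustrative assumptions; TitleExtractor wraps an
LLM-backed llama_index extractor, so an LLM must be configured for it to run.

    from kotaemon.indexing.doc_parsers import TitleExtractor, TokenSplitter

    # TokenSplitter forwards its kwargs to the wrapped llama_index
    # TokenTextSplitter, mirroring text_splitter() in ingest.py above.
    splitter = TokenSplitter(chunk_size=1024, chunk_overlap=256)
    nodes = splitter(documents)  # documents: Sequence[Document], loaded earlier

    # Every doc parser now shares the same run()/__call__ interface, so
    # metadata extractors chain as a flat list, just like the doc_parsers
    # loop added to ReaderIndexingPipeline.run.
    for parser in [TitleExtractor()]:  # assumes an LLM backend is configured
        nodes = parser(nodes)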