Separate rerankers, splitters and extractors (#85)

2023-11-27 14:25:54 +07:00
parent 0dede9c82d
commit 2186c5558f
15 changed files with 211 additions and 135 deletions
--- a/knowledgehub/indexing/doc_parsers.py
+++ b/knowledgehub/indexing/doc_parsers.py
@@ -1,58 +0,0 @@
 from typing import Any, Sequence, Type
 from llama_index.extractors import SummaryExtractor as LISummaryExtractor
 from llama_index.extractors import TitleExtractor as LITitleExtractor
 from llama_index.node_parser import (
    SentenceWindowNodeParser as LISentenceWindowNodeParser,
 )
 from llama_index.node_parser.interface import NodeParser
 from llama_index.text_splitter import TokenTextSplitter as LITokenTextSplitter
 from ..base import BaseComponent, Document
 class LIDocParser(BaseComponent):
    """Expose a LlamaIndex node parser as a kotaemon component.

    Subclasses set `_parser_class` to the LlamaIndex class to wrap. The
    constructor arguments are forwarded to that class; public attribute reads
    and writes are delegated to the wrapped parser instance.
    """

    _parser_class: Type[NodeParser]

    def __init__(self, *args, **kwargs):
        """Instantiate the wrapped parser with the given arguments.

        Raises:
            AttributeError: if the subclass did not define `_parser_class`.
        """
        # Resolve the attribute on the type: going through `self` would fall
        # into `__getattr__` when it is missing and recurse instead of
        # reaching the explicit error below.
        parser_class = getattr(type(self), "_parser_class", None)
        if parser_class is None:
            raise AttributeError(
                "Require `_parser_class` to set a NodeParser class from LlamarIndex"
            )
        self._parser = parser_class(*args, **kwargs)
        super().__init__()

    def __setattr__(self, name: str, value: Any) -> None:
        """Keep private/protected names on the wrapper, forward the rest."""
        if name.startswith("_") or name in self._protected_keywords():
            return super().__setattr__(name, value)
        return setattr(self._parser, name, value)

    def __getattr__(self, name: str) -> Any:
        """Delegate attributes that the wrapper lacks to the wrapped parser."""
        # `__getattr__` only runs after normal lookup failed, so a request for
        # `_parser` itself means the parser was never created; delegating
        # would re-enter this method without end.
        if name == "_parser":
            raise AttributeError(name)
        return getattr(self._parser, name)

    def run(
        self,
        documents: Sequence[Document],
        **kwargs,
    ) -> Sequence[Document]:
        """Call the wrapped parser and rebuild its output as `Document`s.

        Args:
            documents: the documents handed to the wrapped parser.
            **kwargs: extra keyword arguments forwarded to the parser call.

        Returns:
            One `Document` per item produced by the parser, rebuilt through
            `to_dict()` / `Document.from_dict()`.
        """
        parsed = self._parser(documents, **kwargs)
        # convert Document to new base class from kotaemon
        return [Document.from_dict(item.to_dict()) for item in parsed]
 class TokenSplitter(LIDocParser):
    """`LIDocParser` bound to LlamaIndex's `TokenTextSplitter`."""
    _parser_class = LITokenTextSplitter
 class SentenceWindowNodeParser(LIDocParser):
    """`LIDocParser` bound to LlamaIndex's `SentenceWindowNodeParser`."""
    _parser_class = LISentenceWindowNodeParser
 class TitleExtractor(LIDocParser):
    """`LIDocParser` bound to LlamaIndex's `TitleExtractor`."""
    _parser_class = LITitleExtractor
 class SummaryExtractor(LIDocParser):
    """`LIDocParser` bound to LlamaIndex's `SummaryExtractor`."""
    _parser_class = LISummaryExtractor
--- a/knowledgehub/indexing/init.py
+++ b/knowledgehub/indexing/init.py
--- a/knowledgehub/indices/base.py
+++ b/knowledgehub/indices/base.py
@@ -0,0 +1,72 @@
 from __future__ import annotations
 from abc import abstractmethod
 from typing import Any, Sequence, Type
 from llama_index.node_parser.interface import NodeParser
 from ..base import BaseComponent, Document
 class DocTransformer(BaseComponent):
    """This is a base class for document transformers

    A document transformer transforms a list of documents into another list
    of documents. Transforming can mean splitting a document into multiple documents,
    reducing a large list of documents into a smaller list of documents, or adding
    metadata to each document in a list of documents, etc.
    """
    @abstractmethod
    def run(
        self,
        documents: Sequence[Document],
        **kwargs,
    ) -> Sequence[Document]:
        """Transform `documents` into another sequence of documents.

        Args:
            documents: the input documents.
            **kwargs: implementation-specific options.

        Returns:
            The transformed documents.
        """
        ...
 class LlamaIndexMixin:
    """Allow automatically wrapping a Llama-index component into kotaemon component

    Example:
        class TokenSplitter(LlamaIndexMixin, BaseSplitter):
            def _get_li_class(self):
                from llama_index.text_splitter import TokenTextSplitter
                return TokenTextSplitter

    To use this mixin, please:
        1. Use this class as the 1st parent class, so that Python will prefer to use
        the attributes and methods of this class whenever possible.
        2. Overwrite `_get_li_class` to return the relevant LlamaIndex component.

    The wrapped instance is stored in `_obj`. Names starting with `_` and the
    names listed by `_protected_keywords()` are kept on the wrapper; every
    other attribute read or write is forwarded to `_obj`.
    """

    def _get_li_class(self) -> Type[NodeParser]:
        """Return the LlamaIndex class to instantiate; subclasses must override."""
        raise NotImplementedError(
            "Please return the relevant LlamaIndex class in _get_li_class"
        )

    def __init__(self, *args, **kwargs):
        """Build the wrapped LlamaIndex object from the given arguments."""
        _li_cls = self._get_li_class()
        self._obj = _li_cls(*args, **kwargs)
        super().__init__()

    def __setattr__(self, name: str, value: Any) -> None:
        """Keep private/protected names on the wrapper, forward the rest."""
        if name.startswith("_") or name in self._protected_keywords():
            return super().__setattr__(name, value)
        return setattr(self._obj, name, value)

    def __getattr__(self, name: str) -> Any:
        """Delegate attributes that the wrapper lacks to the wrapped object.

        Raises:
            AttributeError: if the wrapped object has not been created yet, or
                if it does not provide `name`.
        """
        # `__getattr__` only runs after normal lookup failed, so a request for
        # `_obj` itself means the wrapped object does not exist (bare instance
        # from `__new__`, `copy`, unpickling). Delegating would re-enter this
        # method until RecursionError, which `hasattr` does not swallow.
        if name == "_obj":
            raise AttributeError(name)
        return getattr(self._obj, name)

    def run(
        self,
        documents: Sequence[Document],
        **kwargs,
    ) -> Sequence[Document]:
        """Run Llama-index node parser and convert the output to Document from
        kotaemon

        Args:
            documents: the documents handed to the wrapped object.
            **kwargs: extra keyword arguments forwarded to the wrapped call.

        Returns:
            One `Document` per item returned by the wrapped object, rebuilt
            through `to_dict()` / `Document.from_dict()`.
        """
        docs = self._obj(documents, **kwargs)  # type: ignore
        return [Document.from_dict(doc.to_dict()) for doc in docs]
--- a/knowledgehub/indices/extractors/init.py
+++ b/knowledgehub/indices/extractors/init.py
@@ -0,0 +1,7 @@
 from .doc_parsers import BaseDocParser, SummaryExtractor, TitleExtractor
 __all__ = [
    "BaseDocParser",
    "TitleExtractor",
    "SummaryExtractor",
 ]
--- a/knowledgehub/indices/extractors/doc_parsers.py
+++ b/knowledgehub/indices/extractors/doc_parsers.py
@@ -0,0 +1,19 @@
 from ..base import DocTransformer, LlamaIndexMixin
 class BaseDocParser(DocTransformer):
    """Common base class of the extractors defined in this module."""
    ...
 class TitleExtractor(LlamaIndexMixin, BaseDocParser):
    """Kotaemon wrapper around LlamaIndex's `TitleExtractor`."""

    def _get_li_class(self):
        """Return the LlamaIndex class wrapped by this component."""
        # Imported at call time; the alias avoids shadowing this class's name.
        from llama_index.extractors import TitleExtractor as LITitleExtractor

        return LITitleExtractor
 class SummaryExtractor(LlamaIndexMixin, BaseDocParser):
    """Kotaemon wrapper around LlamaIndex's `SummaryExtractor`."""

    def _get_li_class(self):
        """Return the LlamaIndex class wrapped by this component."""
        # Imported at call time; the alias avoids shadowing this class's name.
        from llama_index.extractors import SummaryExtractor as LISummaryExtractor

        return LISummaryExtractor
--- a/knowledgehub/indices/rankings/init.py
+++ b/knowledgehub/indices/rankings/init.py
@@ -0,0 +1,5 @@
 from .base import BaseReranking
 from .cohere import CohereReranking
 from .llm import LLMReranking
 __all__ = ["CohereReranking", "LLMReranking", "BaseReranking"]
--- a/knowledgehub/indices/rankings/base.py
+++ b/knowledgehub/indices/rankings/base.py
@@ -0,0 +1,13 @@
 from __future__ import annotations
 from abc import abstractmethod
 from ...base import BaseComponent, Document
 class BaseReranking(BaseComponent):
    """Abstract interface for components that re-rank or filter documents."""
    @abstractmethod
    def run(self, documents: list[Document], query: str) -> list[Document]:
        """Main method to transform list of documents
        (re-ranking, filtering, etc)

        Args:
            documents: the candidate documents.
            query: the query the documents are evaluated against.

        Returns:
            The resulting list of documents.
        """
        ...
--- a/knowledgehub/indices/rankings/cohere.py
+++ b/knowledgehub/indices/rankings/cohere.py
@@ -0,0 +1,38 @@
 from __future__ import annotations
 import os
 from ...base import Document
 from .base import BaseReranking
 class CohereReranking(BaseReranking):
    """Re-rank documents with a Cohere rerank model.

    The API key defaults to the `COHERE_API_KEY` environment variable as read
    when this class body is executed (empty string if unset).
    """

    model_name: str = "rerank-multilingual-v2.0"
    cohere_api_key: str = os.environ.get("COHERE_API_KEY", "")
    top_k: int = 1

    def run(self, documents: list[Document], query: str) -> list[Document]:
        """Use Cohere Reranker model to re-order documents
        with their relevance score

        Args:
            documents: candidate documents; their `content` is sent to Cohere.
            query: the query to score the documents against.

        Returns:
            The documents selected by the rerank call (at most `top_k` are
            requested), in the order returned. Each returned document gets a
            `relevance_score` entry in its metadata; the input objects are
            modified in place.

        Raises:
            ImportError: if the `cohere` package is not installed.
        """
        try:
            import cohere
        except ImportError:
            raise ImportError(
                "Please install Cohere `pip install cohere` to use Cohere Reranking"
            )

        client = cohere.Client(self.cohere_api_key)

        # to avoid empty api call
        if len(documents) == 0:
            return []

        texts = [document.content for document in documents]
        response = client.rerank(
            model=self.model_name,
            query=query,
            documents=texts,
            top_n=self.top_k,
        )

        reranked = []
        for item in response:
            chosen = documents[item.index]
            chosen.metadata["relevance_score"] = item.relevance_score
            reranked.append(chosen)
        return reranked
--- a/knowledgehub/indices/rankings/llm.py
+++ b/knowledgehub/indices/rankings/llm.py
@@ -1,62 +1,18 @@
-import os
+from __future__ import annotations
-from abc import abstractmethod
+
 from concurrent.futures import ThreadPoolExecutor
-from typing import List, Optional, Union
+from typing import Union
 from langchain.output_parsers.boolean import BooleanOutputParser
-from ..base import BaseComponent
+from ...base import Document
-from ..base.schema import Document
+from ...llms import PromptTemplate
-from ..llms import PromptTemplate
+from ...llms.chats.base import ChatLLM
-from ..llms.chats.base import ChatLLM
+from ...llms.completions.base import LLM
-from ..llms.completions.base import LLM
+from .base import BaseReranking
 BaseLLM = Union[ChatLLM, LLM]
 class BaseRerankingPipeline(BaseComponent):
    """Abstract interface for components that re-rank or filter documents."""
    @abstractmethod
    def run(self, documents: List[Document], query: str) -> List[Document]:
        """Main method to transform list of documents
        (re-ranking, filtering, etc)

        Args:
            documents: the candidate documents.
            query: the query the documents are evaluated against.

        Returns:
            The resulting list of documents.
        """
        ...
 class CohereReranking(BaseRerankingPipeline):
    """Re-rank documents with a Cohere rerank model.

    When `cohere_api_key` is not set, the key is read from the
    `COHERE_API_KEY` environment variable at call time.
    """

    model_name: str = "rerank-multilingual-v2.0"
    cohere_api_key: Optional[str] = None
    top_k: int = 1

    def run(self, documents: List[Document], query: str) -> List[Document]:
        """Use Cohere Reranker model to re-order documents
        with their relevance score

        Args:
            documents: candidate documents; their `content` is sent to Cohere.
            query: the query to score the documents against.

        Returns:
            The documents selected by the rerank call (at most `top_k` are
            requested), in the order returned. Each returned document gets a
            `relevance_score` entry in its metadata; the input objects are
            modified in place.

        Raises:
            ImportError: if the `cohere` package is not installed.
            KeyError: if no key is configured and `COHERE_API_KEY` is unset.
        """
        try:
            import cohere
        except ImportError:
            raise ImportError(
                "Please install Cohere `pip install cohere` to use Cohere Reranking"
            )

        # explicit key wins; otherwise fall back to the environment
        api_key = self.cohere_api_key or os.environ["COHERE_API_KEY"]
        client = cohere.Client(api_key)

        # to avoid empty api call
        if len(documents) == 0:
            return []

        texts = [document.content for document in documents]
        response = client.rerank(
            model=self.model_name,
            query=query,
            documents=texts,
            top_n=self.top_k,
        )

        reranked = []
        for item in response:
            chosen = documents[item.index]
            chosen.metadata["relevance_score"] = item.relevance_score
            reranked.append(chosen)
        return reranked
 RERANK_PROMPT_TEMPLATE = """Given the following question and context,
 return YES if the context is relevant to the question and NO if it isn't.
@@ -68,7 +24,7 @@ return YES if the context is relevant to the question and NO if it isn't.
 > Relevant (YES / NO):"""
-class LLMReranking(BaseRerankingPipeline):
+class LLMReranking(BaseReranking):
    llm: BaseLLM
    prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)
    top_k: int = 3
@@ -76,9 +32,9 @@ class LLMReranking(BaseRerankingPipeline):
    def run(
        self,
-        documents: List[Document],
+        documents: list[Document],
        query: str,
-    ) -> List[Document]:
+    ) -> list[Document]:
        """Filter down documents based on their relevance to the query."""
        filtered_docs = []
        output_parser = BooleanOutputParser()
--- a/knowledgehub/indices/splitters/init.py
+++ b/knowledgehub/indices/splitters/init.py
@@ -0,0 +1,21 @@
 from ..base import DocTransformer, LlamaIndexMixin
 class BaseSplitter(DocTransformer):
    """Represent base splitter class

    Common base class of the splitters defined in this module; it adds
    nothing of its own on top of `DocTransformer`.
    """
    ...
 class TokenSplitter(LlamaIndexMixin, BaseSplitter):
    """Kotaemon wrapper around LlamaIndex's `TokenTextSplitter`."""

    def _get_li_class(self):
        """Return the LlamaIndex class wrapped by this component."""
        # Imported at call time rather than at module import time.
        from llama_index.text_splitter import TokenTextSplitter as LITokenTextSplitter

        return LITokenTextSplitter
 class SentenceWindowSplitter(LlamaIndexMixin, BaseSplitter):
    """Kotaemon wrapper around LlamaIndex's `SentenceWindowNodeParser`."""

    def _get_li_class(self):
        """Return the LlamaIndex class wrapped by this component."""
        # Imported at call time rather than at module import time.
        from llama_index.node_parser import (
            SentenceWindowNodeParser as LISentenceWindowNodeParser,
        )

        return LISentenceWindowNodeParser
--- a/knowledgehub/pipelines/indexing.py
+++ b/knowledgehub/pipelines/indexing.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 import uuid
 from pathlib import Path
-from typing import cast
+from typing import Optional, cast
 from ..base import BaseComponent, Document
 from ..embeddings import BaseEmbeddings
@@ -22,7 +22,7 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
    """
    vector_store: BaseVectorStore
-    doc_store: BaseDocumentStore
+    doc_store: Optional[BaseDocumentStore] = None
    embedding: BaseEmbeddings
    # TODO: refer to llama_index's storage as well
@@ -64,7 +64,8 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
        if isinstance(path, str):
            path = Path(path)
        self.vector_store.save(path / vectorstore_fname)
-        self.doc_store.save(path / docstore_fname)
+        if self.doc_store:
            self.doc_store.save(path / docstore_fname)
    def load(
        self,
@@ -76,4 +77,5 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
        if isinstance(path, str):
            path = Path(path)
        self.vector_store.load(path / vectorstore_fname)
-        self.doc_store.load(path / docstore_fname)
+        if self.doc_store:
            self.doc_store.load(path / docstore_fname)
--- a/knowledgehub/pipelines/ingest.py
+++ b/knowledgehub/pipelines/ingest.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 import os
 from pathlib import Path
-from typing import Dict, List, Optional, Sequence, Union
+from typing import Optional, Sequence
 from llama_index.readers.base import BaseReader
 from theflow import Node
@@ -8,8 +10,9 @@ from theflow.utils.modules import ObjectInitDeclaration as _
 from kotaemon.base import BaseComponent
 from kotaemon.embeddings import AzureOpenAIEmbeddings
-from kotaemon.indexing.doc_parsers import LIDocParser as DocParser
+from kotaemon.indices.extractors import BaseDocParser
-from kotaemon.indexing.doc_parsers import TokenSplitter
+from kotaemon.indices.rankings import BaseReranking
 from kotaemon.indices.splitters import TokenSplitter
 from kotaemon.loaders import (
    AutoReader,
    DirectoryReader,
@@ -19,7 +22,6 @@ from kotaemon.loaders import (
 )
 from kotaemon.pipelines.agents import BaseAgent
 from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
 from kotaemon.pipelines.reranking import BaseRerankingPipeline
 from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
 from kotaemon.storages import (
    BaseDocumentStore,
@@ -45,7 +47,7 @@ class ReaderIndexingPipeline(BaseComponent):
    chunk_overlap: int = 256
    vector_store: BaseVectorStore = _(InMemoryVectorStore)
    doc_store: BaseDocumentStore = _(InMemoryDocumentStore)
-    doc_parsers: List[DocParser] = []
+    doc_parsers: list[BaseDocParser] = []
    embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
        model="text-embedding-ada-002",
@@ -55,9 +57,9 @@ class ReaderIndexingPipeline(BaseComponent):
        chunk_size=16,
    )
-    def get_reader(self, input_files: List[Union[str, Path]]):
+    def get_reader(self, input_files: list[str | Path]):
        # document parsers
-        file_extractor: Dict[str, BaseReader] = {
+        file_extractor: dict[str, BaseReader | AutoReader] = {
            ".xlsx": PandasExcelReader(),
        }
        if self.reader_name == "normal":
@@ -89,7 +91,7 @@ class ReaderIndexingPipeline(BaseComponent):
    def run(
        self,
-        file_path_list: Union[List[Union[str, Path]], Union[str, Path]],
+        file_path_list: list[str | Path] | str | Path,
        force_reindex: Optional[bool] = False,
    ):
        self.storage_path.mkdir(exist_ok=True)
@@ -121,9 +123,7 @@ class ReaderIndexingPipeline(BaseComponent):
        else:
            self.indexing_vector_pipeline.load(file_storage_path)
-    def to_retrieving_pipeline(
+    def to_retrieving_pipeline(self, top_k=3, rerankers: Sequence[BaseReranking] = []):
        self, top_k=3, rerankers: Sequence[BaseRerankingPipeline] = []
    ):
        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
            vector_store=self.vector_store,
            doc_store=self.doc_store,
@@ -141,7 +141,7 @@ class ReaderIndexingPipeline(BaseComponent):
            doc_store=self.doc_store,
            embedding=self.embedding,
            llm=llm,
-            **kwargs
+            **kwargs,
        )
        return qa_pipeline
@@ -153,7 +153,7 @@ class ReaderIndexingPipeline(BaseComponent):
            doc_store=self.doc_store,
            embedding=self.embedding,
            agent=agent,
-            **kwargs
+            **kwargs,
        )
        agent_pipeline.add_search_tool()
        return agent_pipeline
--- a/knowledgehub/pipelines/qa.py
+++ b/knowledgehub/pipelines/qa.py
@@ -8,11 +8,11 @@ from theflow.utils.modules import ObjectInitDeclaration as _
 from kotaemon.base import BaseComponent
 from kotaemon.base.schema import Document, RetrievedDocument
 from kotaemon.embeddings import AzureOpenAIEmbeddings
 from kotaemon.indices.rankings import BaseReranking
 from kotaemon.llms import PromptTemplate
 from kotaemon.llms.chats.openai import AzureChatOpenAI
 from kotaemon.pipelines.agents import BaseAgent
 from kotaemon.pipelines.citation import CitationPipeline
 from kotaemon.pipelines.reranking import BaseRerankingPipeline
 from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
 from kotaemon.pipelines.tools import ComponentTool
 from kotaemon.storages import (
@@ -51,7 +51,7 @@ class QuestionAnsweringPipeline(BaseComponent):
    vector_store: BaseVectorStore = _(InMemoryVectorStore)
    doc_store: BaseDocumentStore = _(InMemoryDocumentStore)
-    rerankers: Sequence[BaseRerankingPipeline] = []
+    rerankers: Sequence[BaseReranking] = []
    embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
        model="text-embedding-ada-002",
--- a/knowledgehub/pipelines/retrieving.py
+++ b/knowledgehub/pipelines/retrieving.py
@@ -3,11 +3,12 @@ from __future__ import annotations
 from pathlib import Path
 from typing import Optional, Sequence
 from kotaemon.indices.rankings import BaseReranking
 from ..base import BaseComponent
 from ..base.schema import Document, RetrievedDocument
 from ..embeddings import BaseEmbeddings
 from ..storages import BaseDocumentStore, BaseVectorStore
 from .reranking import BaseRerankingPipeline
 VECTOR_STORE_FNAME = "vectorstore"
 DOC_STORE_FNAME = "docstore"
@@ -19,7 +20,7 @@ class RetrieveDocumentFromVectorStorePipeline(BaseComponent):
    vector_store: BaseVectorStore
    doc_store: BaseDocumentStore
    embedding: BaseEmbeddings
-    rerankers: Sequence[BaseRerankingPipeline] = []
+    rerankers: Sequence[BaseReranking] = []
    top_k: int = 1
    # TODO: refer to llama_index's storage as well
--- a/tests/test_reranking.py
+++ b/tests/test_reranking.py
@@ -4,8 +4,8 @@ import pytest
 from openai.types.chat.chat_completion import ChatCompletion
 from kotaemon.base import Document
 from kotaemon.indices.rankings import LLMReranking
 from kotaemon.llms.chats.openai import AzureChatOpenAI
 from kotaemon.pipelines.reranking import LLMReranking
 _openai_chat_completion_responses = [
    ChatCompletion.parse_obj(