Separate rerankers, splitters and extractors (#85)

This commit is contained in:
Nguyen Trung Duc (john) 2023-11-27 14:25:54 +07:00 committed by GitHub
parent 0dede9c82d
commit 2186c5558f
15 changed files with 211 additions and 135 deletions

View File

@ -1,58 +0,0 @@
from typing import Any, Sequence, Type
from llama_index.extractors import SummaryExtractor as LISummaryExtractor
from llama_index.extractors import TitleExtractor as LITitleExtractor
from llama_index.node_parser import (
SentenceWindowNodeParser as LISentenceWindowNodeParser,
)
from llama_index.node_parser.interface import NodeParser
from llama_index.text_splitter import TokenTextSplitter as LITokenTextSplitter
from ..base import BaseComponent, Document
class LIDocParser(BaseComponent):
    """Wrap a LlamaIndex node parser so it can be used as a component.

    Subclasses set `_parser_class` to the LlamaIndex class to instantiate.
    Attribute names that do not start with an underscore and are not listed in
    `_protected_keywords()` are written to the wrapped parser; attributes that
    normal lookup cannot find are read from the wrapped parser.
    """

    _parser_class: Type[NodeParser]

    def __init__(self, *args, **kwargs):
        """Instantiate the wrapped parser with the given arguments.

        Raises:
            AttributeError: if the subclass did not define `_parser_class`.
        """
        # `_parser_class` is only annotated on this base class. Reading it
        # through the instance would fall into `__getattr__` before `_parser`
        # exists, so look it up on the type instead.
        parser_class = getattr(type(self), "_parser_class", None)
        if parser_class is None:
            raise AttributeError(
                "Require `_parser_class` to set a NodeParser class from LlamarIndex"
            )
        self._parser = parser_class(*args, **kwargs)
        super().__init__()

    def __setattr__(self, name: str, value: Any) -> None:
        """Store private/protected names on self, everything else on the parser."""
        if name.startswith("_") or name in self._protected_keywords():
            return super().__setattr__(name, value)

        return setattr(self._parser, name, value)

    def __getattr__(self, name: str) -> Any:
        """Forward lookups that failed on self to the wrapped parser."""
        # Only reached when normal lookup fails. If `_parser` itself is the
        # missing name, forwarding would call this method again forever.
        if name == "_parser":
            raise AttributeError(name)
        return getattr(self._parser, name)

    def run(
        self,
        documents: Sequence[Document],
        **kwargs,
    ) -> Sequence[Document]:
        """Call the wrapped parser and rebuild each result as a `Document`."""
        documents = self._parser(documents, **kwargs)
        # convert Document to new base class from kotaemon
        converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]
        return converted_documents
class TokenSplitter(LIDocParser):
    """`LIDocParser` bound to LlamaIndex's `TokenTextSplitter`."""

    _parser_class = LITokenTextSplitter
class SentenceWindowNodeParser(LIDocParser):
    """`LIDocParser` bound to LlamaIndex's `SentenceWindowNodeParser`."""

    _parser_class = LISentenceWindowNodeParser
class TitleExtractor(LIDocParser):
    """`LIDocParser` bound to LlamaIndex's `TitleExtractor`."""

    _parser_class = LITitleExtractor
class SummaryExtractor(LIDocParser):
    """`LIDocParser` bound to LlamaIndex's `SummaryExtractor`."""

    _parser_class = LISummaryExtractor

View File

@ -0,0 +1,72 @@
from __future__ import annotations
from abc import abstractmethod
from typing import Any, Sequence, Type
from llama_index.node_parser.interface import NodeParser
from ..base import BaseComponent, Document
class DocTransformer(BaseComponent):
    """Base class for components that turn a list of documents into another list.

    The transformation is left to subclasses: it may split one document into
    several, reduce a large list to a smaller one, attach metadata to each
    document, and so on.
    """

    @abstractmethod
    def run(
        self,
        documents: Sequence[Document],
        **kwargs,
    ) -> Sequence[Document]:
        """Transform `documents`; concrete subclasses supply the implementation."""
class LlamaIndexMixin:
    """Allow automatically wrapping a Llama-index component into kotaemon component

    Example:
        class TokenSplitter(LlamaIndexMixin, BaseSplitter):
            def _get_li_class(self):
                from llama_index.text_splitter import TokenTextSplitter
                return TokenTextSplitter

    To use this mixin, please:
        1. Use this class as the 1st parent class, so that Python will prefer to use
        the attributes and methods of this class whenever possible.
        2. Overwrite `_get_li_class` to return the relevant LlamaIndex component.
    """

    def _get_li_class(self) -> Type[NodeParser]:
        """Return the LlamaIndex class to wrap; subclasses must override this."""
        raise NotImplementedError(
            "Please return the relevant LlamaIndex class in _get_li_class"
        )

    def __init__(self, *args, **kwargs):
        """Build the wrapped LlamaIndex object from the given arguments."""
        _li_cls = self._get_li_class()
        self._obj = _li_cls(*args, **kwargs)
        super().__init__()

    def __setattr__(self, name: str, value: Any) -> None:
        """Store private/protected names on self, everything else on `_obj`."""
        if name.startswith("_") or name in self._protected_keywords():
            return super().__setattr__(name, value)

        return setattr(self._obj, name, value)

    def __getattr__(self, name: str) -> Any:
        """Forward lookups that failed on self to the wrapped object.

        Raises:
            AttributeError: if `_obj` itself has not been set yet.
        """
        # Only reached when normal lookup fails. When `_obj` is the missing
        # name (instance created without `__init__`, e.g. by copy or pickle),
        # forwarding would re-enter this method without end.
        if name == "_obj":
            raise AttributeError(name)
        return getattr(self._obj, name)

    def run(
        self,
        documents: Sequence[Document],
        **kwargs,
    ) -> Sequence[Document]:
        """Run Llama-index node parser and convert the output to Document from
        kotaemon
        """
        docs = self._obj(documents, **kwargs)  # type: ignore
        return [Document.from_dict(doc.to_dict()) for doc in docs]

View File

@ -0,0 +1,7 @@
from .doc_parsers import BaseDocParser, SummaryExtractor, TitleExtractor
__all__ = [
"BaseDocParser",
"TitleExtractor",
"SummaryExtractor",
]

View File

@ -0,0 +1,19 @@
from ..base import DocTransformer, LlamaIndexMixin
class BaseDocParser(DocTransformer):
    """Common parent of the extractor components; adds nothing to `DocTransformer`."""
class TitleExtractor(LlamaIndexMixin, BaseDocParser):
    """Doc parser backed by `llama_index.extractors.TitleExtractor`."""

    def _get_li_class(self):
        """Return the LlamaIndex class to wrap (imported on first use)."""
        import llama_index.extractors as li_extractors

        return li_extractors.TitleExtractor
class SummaryExtractor(LlamaIndexMixin, BaseDocParser):
    """Doc parser backed by `llama_index.extractors.SummaryExtractor`."""

    def _get_li_class(self):
        """Return the LlamaIndex class to wrap (imported on first use)."""
        import llama_index.extractors as li_extractors

        return li_extractors.SummaryExtractor

View File

@ -0,0 +1,5 @@
from .base import BaseReranking
from .cohere import CohereReranking
from .llm import LLMReranking
__all__ = ["CohereReranking", "LLMReranking", "BaseReranking"]

View File

@ -0,0 +1,13 @@
from __future__ import annotations
from abc import abstractmethod
from ...base import BaseComponent, Document
class BaseReranking(BaseComponent):
    """Abstract interface for components that reorder or filter documents."""

    @abstractmethod
    def run(self, documents: list[Document], query: str) -> list[Document]:
        """Transform `documents` with respect to `query`.

        Concrete subclasses decide what the transformation is (re-ranking,
        filtering, etc).
        """

View File

@ -0,0 +1,38 @@
from __future__ import annotations
import os
from ...base import Document
from .base import BaseReranking
class CohereReranking(BaseReranking):
    """Rerank documents by calling Cohere's rerank endpoint."""

    # Model name forwarded to `cohere.Client.rerank`.
    model_name: str = "rerank-multilingual-v2.0"
    # API key; the default is the COHERE_API_KEY value seen at import time.
    cohere_api_key: str = os.environ.get("COHERE_API_KEY", "")
    # Forwarded to the rerank call as `top_n`.
    top_k: int = 1

    def run(self, documents: list[Document], query: str) -> list[Document]:
        """Use Cohere Reranker model to re-order documents
        with their relevance score

        Args:
            documents: candidates; each document's `content` is sent to Cohere.
            query: the query the documents are ranked against.

        Returns:
            The documents selected by the API, in the order the API returned
            them, each with `metadata["relevance_score"]` set. Empty when
            `documents` is empty.

        Raises:
            ImportError: if the `cohere` package is not installed.
        """
        try:
            import cohere
        except ImportError as err:
            raise ImportError(
                "Please install Cohere " "`pip install cohere` to use Cohere Reranking"
            ) from err

        # The class-level default is evaluated once at import time, so also
        # consult the environment now: a key exported later is still honoured.
        api_key = self.cohere_api_key or os.environ.get("COHERE_API_KEY", "")
        cohere_client = cohere.Client(api_key)

        # output documents
        compressed_docs = []
        if len(documents) > 0:  # to avoid empty api call
            _docs = [d.content for d in documents]
            results = cohere_client.rerank(
                model=self.model_name, query=query, documents=_docs, top_n=self.top_k
            )
            for r in results:
                # `r.index` points back into the list that was submitted.
                doc = documents[r.index]
                doc.metadata["relevance_score"] = r.relevance_score
                compressed_docs.append(doc)

        return compressed_docs

View File

@ -1,62 +1,18 @@
import os
from abc import abstractmethod
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor
from typing import List, Optional, Union
from typing import Union
from langchain.output_parsers.boolean import BooleanOutputParser
from ..base import BaseComponent
from ..base.schema import Document
from ..llms import PromptTemplate
from ..llms.chats.base import ChatLLM
from ..llms.completions.base import LLM
from ...base import Document
from ...llms import PromptTemplate
from ...llms.chats.base import ChatLLM
from ...llms.completions.base import LLM
from .base import BaseReranking
BaseLLM = Union[ChatLLM, LLM]
class BaseRerankingPipeline(BaseComponent):
    """Abstract interface for components that reorder or filter documents."""

    @abstractmethod
    def run(self, documents: List[Document], query: str) -> List[Document]:
        """Transform `documents` with respect to `query`.

        Concrete subclasses decide what the transformation is (re-ranking,
        filtering, etc).
        """
class CohereReranking(BaseRerankingPipeline):
    """Rerank documents by calling Cohere's rerank endpoint."""

    model_name: str = "rerank-multilingual-v2.0"
    cohere_api_key: Optional[str] = None
    top_k: int = 1

    def run(self, documents: List[Document], query: str) -> List[Document]:
        """Re-order `documents` by relevance to `query` using Cohere.

        Each returned document gets `metadata["relevance_score"]` set from the
        API result. The API key is `cohere_api_key` when set, otherwise the
        COHERE_API_KEY environment variable.
        """
        try:
            import cohere
        except ImportError:
            raise ImportError(
                "Please install Cohere " "`pip install cohere` to use Cohere Reranking"
            )

        api_key = self.cohere_api_key or os.environ["COHERE_API_KEY"]
        client = cohere.Client(api_key)

        # Nothing to rank: skip the API call entirely.
        if len(documents) == 0:
            return []

        texts = [document.content for document in documents]
        ranking = client.rerank(
            model=self.model_name, query=query, documents=texts, top_n=self.top_k
        )

        reranked = []
        for item in ranking:
            picked = documents[item.index]
            picked.metadata["relevance_score"] = item.relevance_score
            reranked.append(picked)
        return reranked
RERANK_PROMPT_TEMPLATE = """Given the following question and context,
return YES if the context is relevant to the question and NO if it isn't.
@ -68,7 +24,7 @@ return YES if the context is relevant to the question and NO if it isn't.
> Relevant (YES / NO):"""
class LLMReranking(BaseRerankingPipeline):
class LLMReranking(BaseReranking):
llm: BaseLLM
prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)
top_k: int = 3
@ -76,9 +32,9 @@ class LLMReranking(BaseRerankingPipeline):
def run(
self,
documents: List[Document],
documents: list[Document],
query: str,
) -> List[Document]:
) -> list[Document]:
"""Filter down documents based on their relevance to the query."""
filtered_docs = []
output_parser = BooleanOutputParser()

View File

@ -0,0 +1,21 @@
from ..base import DocTransformer, LlamaIndexMixin
class BaseSplitter(DocTransformer):
    """Common parent of the splitter components; adds nothing to `DocTransformer`."""
class TokenSplitter(LlamaIndexMixin, BaseSplitter):
    """Splitter backed by `llama_index.text_splitter.TokenTextSplitter`."""

    def _get_li_class(self):
        """Return the LlamaIndex class to wrap (imported on first use)."""
        import llama_index.text_splitter as li_text_splitter

        return li_text_splitter.TokenTextSplitter
class SentenceWindowSplitter(LlamaIndexMixin, BaseSplitter):
    """Splitter backed by `llama_index.node_parser.SentenceWindowNodeParser`."""

    def _get_li_class(self):
        """Return the LlamaIndex class to wrap (imported on first use)."""
        import llama_index.node_parser as li_node_parser

        return li_node_parser.SentenceWindowNodeParser

View File

@ -2,7 +2,7 @@ from __future__ import annotations
import uuid
from pathlib import Path
from typing import cast
from typing import Optional, cast
from ..base import BaseComponent, Document
from ..embeddings import BaseEmbeddings
@ -22,7 +22,7 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
"""
vector_store: BaseVectorStore
doc_store: BaseDocumentStore
doc_store: Optional[BaseDocumentStore] = None
embedding: BaseEmbeddings
# TODO: refer to llama_index's storage as well
@ -64,7 +64,8 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
if isinstance(path, str):
path = Path(path)
self.vector_store.save(path / vectorstore_fname)
self.doc_store.save(path / docstore_fname)
if self.doc_store:
self.doc_store.save(path / docstore_fname)
def load(
self,
@ -76,4 +77,5 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
if isinstance(path, str):
path = Path(path)
self.vector_store.load(path / vectorstore_fname)
self.doc_store.load(path / docstore_fname)
if self.doc_store:
self.doc_store.load(path / docstore_fname)

View File

@ -1,6 +1,8 @@
from __future__ import annotations
import os
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Union
from typing import Optional, Sequence
from llama_index.readers.base import BaseReader
from theflow import Node
@ -8,8 +10,9 @@ from theflow.utils.modules import ObjectInitDeclaration as _
from kotaemon.base import BaseComponent
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.indexing.doc_parsers import LIDocParser as DocParser
from kotaemon.indexing.doc_parsers import TokenSplitter
from kotaemon.indices.extractors import BaseDocParser
from kotaemon.indices.rankings import BaseReranking
from kotaemon.indices.splitters import TokenSplitter
from kotaemon.loaders import (
AutoReader,
DirectoryReader,
@ -19,7 +22,6 @@ from kotaemon.loaders import (
)
from kotaemon.pipelines.agents import BaseAgent
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
from kotaemon.pipelines.reranking import BaseRerankingPipeline
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
from kotaemon.storages import (
BaseDocumentStore,
@ -45,7 +47,7 @@ class ReaderIndexingPipeline(BaseComponent):
chunk_overlap: int = 256
vector_store: BaseVectorStore = _(InMemoryVectorStore)
doc_store: BaseDocumentStore = _(InMemoryDocumentStore)
doc_parsers: List[DocParser] = []
doc_parsers: list[BaseDocParser] = []
embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
model="text-embedding-ada-002",
@ -55,9 +57,9 @@ class ReaderIndexingPipeline(BaseComponent):
chunk_size=16,
)
def get_reader(self, input_files: List[Union[str, Path]]):
def get_reader(self, input_files: list[str | Path]):
# document parsers
file_extractor: Dict[str, BaseReader] = {
file_extractor: dict[str, BaseReader | AutoReader] = {
".xlsx": PandasExcelReader(),
}
if self.reader_name == "normal":
@ -89,7 +91,7 @@ class ReaderIndexingPipeline(BaseComponent):
def run(
self,
file_path_list: Union[List[Union[str, Path]], Union[str, Path]],
file_path_list: list[str | Path] | str | Path,
force_reindex: Optional[bool] = False,
):
self.storage_path.mkdir(exist_ok=True)
@ -121,9 +123,7 @@ class ReaderIndexingPipeline(BaseComponent):
else:
self.indexing_vector_pipeline.load(file_storage_path)
def to_retrieving_pipeline(
self, top_k=3, rerankers: Sequence[BaseRerankingPipeline] = []
):
def to_retrieving_pipeline(self, top_k=3, rerankers: Sequence[BaseReranking] = []):
retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
vector_store=self.vector_store,
doc_store=self.doc_store,
@ -141,7 +141,7 @@ class ReaderIndexingPipeline(BaseComponent):
doc_store=self.doc_store,
embedding=self.embedding,
llm=llm,
**kwargs
**kwargs,
)
return qa_pipeline
@ -153,7 +153,7 @@ class ReaderIndexingPipeline(BaseComponent):
doc_store=self.doc_store,
embedding=self.embedding,
agent=agent,
**kwargs
**kwargs,
)
agent_pipeline.add_search_tool()
return agent_pipeline

View File

@ -8,11 +8,11 @@ from theflow.utils.modules import ObjectInitDeclaration as _
from kotaemon.base import BaseComponent
from kotaemon.base.schema import Document, RetrievedDocument
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.indices.rankings import BaseReranking
from kotaemon.llms import PromptTemplate
from kotaemon.llms.chats.openai import AzureChatOpenAI
from kotaemon.pipelines.agents import BaseAgent
from kotaemon.pipelines.citation import CitationPipeline
from kotaemon.pipelines.reranking import BaseRerankingPipeline
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
from kotaemon.pipelines.tools import ComponentTool
from kotaemon.storages import (
@ -51,7 +51,7 @@ class QuestionAnsweringPipeline(BaseComponent):
vector_store: BaseVectorStore = _(InMemoryVectorStore)
doc_store: BaseDocumentStore = _(InMemoryDocumentStore)
rerankers: Sequence[BaseRerankingPipeline] = []
rerankers: Sequence[BaseReranking] = []
embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
model="text-embedding-ada-002",

View File

@ -3,11 +3,12 @@ from __future__ import annotations
from pathlib import Path
from typing import Optional, Sequence
from kotaemon.indices.rankings import BaseReranking
from ..base import BaseComponent
from ..base.schema import Document, RetrievedDocument
from ..embeddings import BaseEmbeddings
from ..storages import BaseDocumentStore, BaseVectorStore
from .reranking import BaseRerankingPipeline
VECTOR_STORE_FNAME = "vectorstore"
DOC_STORE_FNAME = "docstore"
@ -19,7 +20,7 @@ class RetrieveDocumentFromVectorStorePipeline(BaseComponent):
vector_store: BaseVectorStore
doc_store: BaseDocumentStore
embedding: BaseEmbeddings
rerankers: Sequence[BaseRerankingPipeline] = []
rerankers: Sequence[BaseReranking] = []
top_k: int = 1
# TODO: refer to llama_index's storage as well

View File

@ -4,8 +4,8 @@ import pytest
from openai.types.chat.chat_completion import ChatCompletion
from kotaemon.base import Document
from kotaemon.indices.rankings import LLMReranking
from kotaemon.llms.chats.openai import AzureChatOpenAI
from kotaemon.pipelines.reranking import LLMReranking
_openai_chat_completion_responses = [
ChatCompletion.parse_obj(