Separate rerankers, splitters and extractors (#85)
This commit is contained in:
parent
0dede9c82d
commit
2186c5558f
|
@ -1,58 +0,0 @@
|
||||||
from typing import Any, Sequence, Type
|
|
||||||
|
|
||||||
from llama_index.extractors import SummaryExtractor as LISummaryExtractor
|
|
||||||
from llama_index.extractors import TitleExtractor as LITitleExtractor
|
|
||||||
from llama_index.node_parser import (
|
|
||||||
SentenceWindowNodeParser as LISentenceWindowNodeParser,
|
|
||||||
)
|
|
||||||
from llama_index.node_parser.interface import NodeParser
|
|
||||||
from llama_index.text_splitter import TokenTextSplitter as LITokenTextSplitter
|
|
||||||
|
|
||||||
from ..base import BaseComponent, Document
|
|
||||||
|
|
||||||
|
|
||||||
class LIDocParser(BaseComponent):
    """Wrap a LlamaIndex node parser so it can be used as a kotaemon component.

    Subclasses set `_parser_class` to the LlamaIndex class to wrap. Constructor
    arguments are forwarded to that class. Attribute reads that normal lookup
    cannot resolve, and writes to public attributes, are delegated to the
    wrapped parser instance.
    """

    _parser_class: Type[NodeParser]

    def __init__(self, *args, **kwargs):
        """Instantiate the wrapped parser with the given arguments.

        Raises:
            AttributeError: if the subclass did not set `_parser_class`.
        """
        # Look the attribute up on the type. Reading `self._parser_class` when
        # it was only annotated (never assigned) falls through to `__getattr__`
        # and would recurse instead of reporting the real problem.
        parser_class = getattr(type(self), "_parser_class", None)
        if parser_class is None:
            raise AttributeError(
                "Require `_parser_class` to set a NodeParser class from LlamaIndex"
            )
        self._parser = parser_class(*args, **kwargs)
        super().__init__()

    def __setattr__(self, name: str, value: Any) -> None:
        """Store private/protected names on the wrapper, the rest on the parser."""
        if name.startswith("_") or name in self._protected_keywords():
            return super().__setattr__(name, value)

        return setattr(self._parser, name, value)

    def __getattr__(self, name: str) -> Any:
        """Delegate attribute reads that the wrapper cannot resolve to the parser."""
        # If `_parser` itself is missing (e.g. the wrapped constructor raised),
        # delegating would re-enter this method forever. Fail cleanly instead.
        if name == "_parser":
            raise AttributeError(name)
        return getattr(self._parser, name)

    def run(
        self,
        documents: Sequence[Document],
        **kwargs,
    ) -> Sequence[Document]:
        """Run the wrapped parser and convert its output to kotaemon Documents."""
        parsed = self._parser(documents, **kwargs)
        # convert Document to new base class from kotaemon
        return [Document.from_dict(doc.to_dict()) for doc in parsed]
|
|
||||||
|
|
||||||
|
|
||||||
class TokenSplitter(LIDocParser):
    """Document parser backed by LlamaIndex's ``TokenTextSplitter``."""

    _parser_class = LITokenTextSplitter
|
|
||||||
|
|
||||||
|
|
||||||
class SentenceWindowNodeParser(LIDocParser):
    """Document parser backed by LlamaIndex's ``SentenceWindowNodeParser``."""

    _parser_class = LISentenceWindowNodeParser
|
|
||||||
|
|
||||||
|
|
||||||
class TitleExtractor(LIDocParser):
    """Document parser backed by LlamaIndex's ``TitleExtractor``."""

    _parser_class = LITitleExtractor
|
|
||||||
|
|
||||||
|
|
||||||
class SummaryExtractor(LIDocParser):
    """Document parser backed by LlamaIndex's ``SummaryExtractor``."""

    _parser_class = LISummaryExtractor
|
|
72
knowledgehub/indices/base.py
Normal file
72
knowledgehub/indices/base.py
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from abc import abstractmethod
|
||||||
|
from typing import Any, Sequence, Type
|
||||||
|
|
||||||
|
from llama_index.node_parser.interface import NodeParser
|
||||||
|
|
||||||
|
from ..base import BaseComponent, Document
|
||||||
|
|
||||||
|
|
||||||
|
class DocTransformer(BaseComponent):
    """Base class for document transformers.

    A document transformer maps one list of documents to another list of
    documents. Examples of such a transformation: splitting each document into
    several documents, reducing a large list of documents to a smaller one, or
    adding metadata to every document in the list.
    """

    @abstractmethod
    def run(
        self,
        documents: Sequence[Document],
        **kwargs,
    ) -> Sequence[Document]:
        """Transform `documents` and return the resulting documents."""
        ...
|
||||||
|
|
||||||
|
|
||||||
|
class LlamaIndexMixin:
    """Allow automatically wrapping a Llama-index component into kotaemon component

    Example:
        class TokenSplitter(LlamaIndexMixin, BaseSplitter):
            def _get_li_class(self):
                from llama_index.text_splitter import TokenTextSplitter
                return TokenTextSplitter

    To use this mixin, please:
        1. Use this class as the 1st parent class, so that Python will prefer to use
        the attributes and methods of this class whenever possible.
        2. Overwrite `_get_li_class` to return the relevant LlamaIndex component.

    Constructor arguments are forwarded to the LlamaIndex class. Attribute reads
    that normal lookup cannot resolve, and writes to public attributes, are
    delegated to the wrapped LlamaIndex object stored in `_obj`.
    """

    def _get_li_class(self) -> Type[NodeParser]:
        """Return the LlamaIndex class to wrap; subclasses must override this."""
        raise NotImplementedError(
            "Please return the relevant LlamaIndex class in _get_li_class"
        )

    def __init__(self, *args, **kwargs):
        """Build the wrapped LlamaIndex object from the given arguments."""
        li_cls = self._get_li_class()
        self._obj = li_cls(*args, **kwargs)
        super().__init__()

    def __setattr__(self, name: str, value: Any) -> None:
        """Store private/protected names on the wrapper, the rest on `_obj`."""
        if name.startswith("_") or name in self._protected_keywords():
            return super().__setattr__(name, value)

        return setattr(self._obj, name, value)

    def __getattr__(self, name: str) -> Any:
        """Delegate attribute reads that the wrapper cannot resolve to `_obj`."""
        # If `_obj` itself is missing (the wrapped constructor raised, or the
        # instance was created without `__init__`, as copy/pickle do), then
        # `self._obj` below would re-enter this method forever. Raise a plain
        # AttributeError so `hasattr`/`getattr(..., default)` keep working.
        if name == "_obj":
            raise AttributeError(name)
        return getattr(self._obj, name)

    def run(
        self,
        documents: Sequence[Document],
        **kwargs,
    ) -> Sequence[Document]:
        """Run Llama-index node parser and convert the output to Document from
        kotaemon
        """
        docs = self._obj(documents, **kwargs)  # type: ignore
        return [Document.from_dict(doc.to_dict()) for doc in docs]
|
7
knowledgehub/indices/extractors/__init__.py
Normal file
7
knowledgehub/indices/extractors/__init__.py
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
from .doc_parsers import BaseDocParser, SummaryExtractor, TitleExtractor
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"BaseDocParser",
|
||||||
|
"TitleExtractor",
|
||||||
|
"SummaryExtractor",
|
||||||
|
]
|
19
knowledgehub/indices/extractors/doc_parsers.py
Normal file
19
knowledgehub/indices/extractors/doc_parsers.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
from ..base import DocTransformer, LlamaIndexMixin
|
||||||
|
|
||||||
|
|
||||||
|
class BaseDocParser(DocTransformer):
    """Base class for document parsers; a marker subclass of DocTransformer."""
|
||||||
|
|
||||||
|
|
||||||
|
class TitleExtractor(LlamaIndexMixin, BaseDocParser):
    """Document parser that wraps LlamaIndex's ``TitleExtractor``."""

    def _get_li_class(self):
        """Return the LlamaIndex class to wrap, importing it lazily."""
        from llama_index.extractors import TitleExtractor as LITitleExtractor

        return LITitleExtractor
|
||||||
|
|
||||||
|
|
||||||
|
class SummaryExtractor(LlamaIndexMixin, BaseDocParser):
    """Document parser that wraps LlamaIndex's ``SummaryExtractor``."""

    def _get_li_class(self):
        """Return the LlamaIndex class to wrap, importing it lazily."""
        from llama_index.extractors import SummaryExtractor as LISummaryExtractor

        return LISummaryExtractor
|
5
knowledgehub/indices/rankings/__init__.py
Normal file
5
knowledgehub/indices/rankings/__init__.py
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
from .base import BaseReranking
|
||||||
|
from .cohere import CohereReranking
|
||||||
|
from .llm import LLMReranking
|
||||||
|
|
||||||
|
__all__ = ["CohereReranking", "LLMReranking", "BaseReranking"]
|
13
knowledgehub/indices/rankings/base.py
Normal file
13
knowledgehub/indices/rankings/base.py
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from abc import abstractmethod
|
||||||
|
|
||||||
|
from ...base import BaseComponent, Document
|
||||||
|
|
||||||
|
|
||||||
|
class BaseReranking(BaseComponent):
    """Interface for components that transform documents with respect to a query."""

    @abstractmethod
    def run(self, documents: list[Document], query: str) -> list[Document]:
        """Transform the list of documents for the given query.

        Implementations may re-rank, filter, etc.
        """
        ...
|
38
knowledgehub/indices/rankings/cohere.py
Normal file
38
knowledgehub/indices/rankings/cohere.py
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from ...base import Document
|
||||||
|
from .base import BaseReranking
|
||||||
|
|
||||||
|
|
||||||
|
class CohereReranking(BaseReranking):
    """Re-rank documents against a query using a Cohere rerank model."""

    model_name: str = "rerank-multilingual-v2.0"
    # Default is read from the environment when this module is imported; `run`
    # re-reads the environment if the value is still empty at call time.
    cohere_api_key: str = os.environ.get("COHERE_API_KEY", "")
    top_k: int = 1

    def run(self, documents: list[Document], query: str) -> list[Document]:
        """Use Cohere Reranker model to re-order documents
        with their relevance score

        Args:
            documents: candidate documents; their `content` is sent to Cohere.
            query: the query to rank the documents against.

        Returns:
            The documents selected by Cohere (at most `top_k` are requested), in
            the order Cohere returned them. Each returned document is the input
            object itself, with `metadata["relevance_score"]` set.

        Raises:
            ImportError: if the `cohere` package is not installed.
        """
        try:
            import cohere
        except ImportError as exc:
            raise ImportError(
                "Please install Cohere " "`pip install cohere` to use Cohere Reranking"
            ) from exc

        # Nothing to rank: return before building a client so that empty input
        # never touches the Cohere API.
        if not documents:
            return []

        # Fall back to the current environment so a key exported after this
        # module was imported is still picked up.
        api_key = self.cohere_api_key or os.environ.get("COHERE_API_KEY", "")
        cohere_client = cohere.Client(api_key)

        results = cohere_client.rerank(
            model=self.model_name,
            query=query,
            documents=[d.content for d in documents],
            top_n=self.top_k,
        )

        reranked_docs = []
        for r in results:
            doc = documents[r.index]
            doc.metadata["relevance_score"] = r.relevance_score
            reranked_docs.append(doc)

        return reranked_docs
|
|
@ -1,62 +1,18 @@
|
||||||
import os
|
from __future__ import annotations
|
||||||
from abc import abstractmethod
|
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from typing import List, Optional, Union
|
from typing import Union
|
||||||
|
|
||||||
from langchain.output_parsers.boolean import BooleanOutputParser
|
from langchain.output_parsers.boolean import BooleanOutputParser
|
||||||
|
|
||||||
from ..base import BaseComponent
|
from ...base import Document
|
||||||
from ..base.schema import Document
|
from ...llms import PromptTemplate
|
||||||
from ..llms import PromptTemplate
|
from ...llms.chats.base import ChatLLM
|
||||||
from ..llms.chats.base import ChatLLM
|
from ...llms.completions.base import LLM
|
||||||
from ..llms.completions.base import LLM
|
from .base import BaseReranking
|
||||||
|
|
||||||
BaseLLM = Union[ChatLLM, LLM]
|
BaseLLM = Union[ChatLLM, LLM]
|
||||||
|
|
||||||
|
|
||||||
class BaseRerankingPipeline(BaseComponent):
    """Interface for components that transform documents with respect to a query."""

    @abstractmethod
    def run(self, documents: List[Document], query: str) -> List[Document]:
        """Transform the list of documents for the given query.

        Implementations may re-rank, filter, etc.
        """
        ...
|
|
||||||
|
|
||||||
|
|
||||||
class CohereReranking(BaseRerankingPipeline):
    """Re-rank documents against a query using a Cohere rerank model."""

    model_name: str = "rerank-multilingual-v2.0"
    # When unset, `run` reads the key from the COHERE_API_KEY environment variable.
    cohere_api_key: Optional[str] = None
    top_k: int = 1

    def run(self, documents: List[Document], query: str) -> List[Document]:
        """Use Cohere Reranker model to re-order documents
        with their relevance score

        Returns the documents selected by Cohere (at most `top_k` are requested),
        each with `metadata["relevance_score"]` set on the input object itself.

        Raises:
            ImportError: if the `cohere` package is not installed.
            KeyError: if no key is configured and COHERE_API_KEY is not set.
        """
        try:
            import cohere
        except ImportError as exc:
            raise ImportError(
                "Please install Cohere " "`pip install cohere` to use Cohere Reranking"
            ) from exc

        # Nothing to rank: return before resolving the key or building a client
        # so that empty input never touches the Cohere API.
        if not documents:
            return []

        cohere_api_key = (
            self.cohere_api_key if self.cohere_api_key else os.environ["COHERE_API_KEY"]
        )
        cohere_client = cohere.Client(cohere_api_key)

        results = cohere_client.rerank(
            model=self.model_name,
            query=query,
            documents=[d.content for d in documents],
            top_n=self.top_k,
        )

        reranked_docs = []
        for r in results:
            doc = documents[r.index]
            doc.metadata["relevance_score"] = r.relevance_score
            reranked_docs.append(doc)

        return reranked_docs
|
|
||||||
|
|
||||||
|
|
||||||
RERANK_PROMPT_TEMPLATE = """Given the following question and context,
|
RERANK_PROMPT_TEMPLATE = """Given the following question and context,
|
||||||
return YES if the context is relevant to the question and NO if it isn't.
|
return YES if the context is relevant to the question and NO if it isn't.
|
||||||
|
|
||||||
|
@ -68,7 +24,7 @@ return YES if the context is relevant to the question and NO if it isn't.
|
||||||
> Relevant (YES / NO):"""
|
> Relevant (YES / NO):"""
|
||||||
|
|
||||||
|
|
||||||
class LLMReranking(BaseRerankingPipeline):
|
class LLMReranking(BaseReranking):
|
||||||
llm: BaseLLM
|
llm: BaseLLM
|
||||||
prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)
|
prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)
|
||||||
top_k: int = 3
|
top_k: int = 3
|
||||||
|
@ -76,9 +32,9 @@ class LLMReranking(BaseRerankingPipeline):
|
||||||
|
|
||||||
def run(
|
def run(
|
||||||
self,
|
self,
|
||||||
documents: List[Document],
|
documents: list[Document],
|
||||||
query: str,
|
query: str,
|
||||||
) -> List[Document]:
|
) -> list[Document]:
|
||||||
"""Filter down documents based on their relevance to the query."""
|
"""Filter down documents based on their relevance to the query."""
|
||||||
filtered_docs = []
|
filtered_docs = []
|
||||||
output_parser = BooleanOutputParser()
|
output_parser = BooleanOutputParser()
|
21
knowledgehub/indices/splitters/__init__.py
Normal file
21
knowledgehub/indices/splitters/__init__.py
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
from ..base import DocTransformer, LlamaIndexMixin
|
||||||
|
|
||||||
|
|
||||||
|
class BaseSplitter(DocTransformer):
    """Base class for splitters; a marker subclass of DocTransformer."""
|
||||||
|
|
||||||
|
|
||||||
|
class TokenSplitter(LlamaIndexMixin, BaseSplitter):
    """Splitter that wraps LlamaIndex's ``TokenTextSplitter``."""

    def _get_li_class(self):
        """Return the LlamaIndex class to wrap, importing it lazily."""
        from llama_index.text_splitter import TokenTextSplitter as LITokenTextSplitter

        return LITokenTextSplitter
|
||||||
|
|
||||||
|
|
||||||
|
class SentenceWindowSplitter(LlamaIndexMixin, BaseSplitter):
    """Splitter that wraps LlamaIndex's ``SentenceWindowNodeParser``."""

    def _get_li_class(self):
        """Return the LlamaIndex class to wrap, importing it lazily."""
        from llama_index.node_parser import (
            SentenceWindowNodeParser as LISentenceWindowNodeParser,
        )

        return LISentenceWindowNodeParser
|
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import cast
|
from typing import Optional, cast
|
||||||
|
|
||||||
from ..base import BaseComponent, Document
|
from ..base import BaseComponent, Document
|
||||||
from ..embeddings import BaseEmbeddings
|
from ..embeddings import BaseEmbeddings
|
||||||
|
@ -22,7 +22,7 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vector_store: BaseVectorStore
|
vector_store: BaseVectorStore
|
||||||
doc_store: BaseDocumentStore
|
doc_store: Optional[BaseDocumentStore] = None
|
||||||
embedding: BaseEmbeddings
|
embedding: BaseEmbeddings
|
||||||
# TODO: refer to llama_index's storage as well
|
# TODO: refer to llama_index's storage as well
|
||||||
|
|
||||||
|
@ -64,7 +64,8 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
|
||||||
if isinstance(path, str):
|
if isinstance(path, str):
|
||||||
path = Path(path)
|
path = Path(path)
|
||||||
self.vector_store.save(path / vectorstore_fname)
|
self.vector_store.save(path / vectorstore_fname)
|
||||||
self.doc_store.save(path / docstore_fname)
|
if self.doc_store:
|
||||||
|
self.doc_store.save(path / docstore_fname)
|
||||||
|
|
||||||
def load(
|
def load(
|
||||||
self,
|
self,
|
||||||
|
@ -76,4 +77,5 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
|
||||||
if isinstance(path, str):
|
if isinstance(path, str):
|
||||||
path = Path(path)
|
path = Path(path)
|
||||||
self.vector_store.load(path / vectorstore_fname)
|
self.vector_store.load(path / vectorstore_fname)
|
||||||
self.doc_store.load(path / docstore_fname)
|
if self.doc_store:
|
||||||
|
self.doc_store.load(path / docstore_fname)
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Optional, Sequence, Union
|
from typing import Optional, Sequence
|
||||||
|
|
||||||
from llama_index.readers.base import BaseReader
|
from llama_index.readers.base import BaseReader
|
||||||
from theflow import Node
|
from theflow import Node
|
||||||
|
@ -8,8 +10,9 @@ from theflow.utils.modules import ObjectInitDeclaration as _
|
||||||
|
|
||||||
from kotaemon.base import BaseComponent
|
from kotaemon.base import BaseComponent
|
||||||
from kotaemon.embeddings import AzureOpenAIEmbeddings
|
from kotaemon.embeddings import AzureOpenAIEmbeddings
|
||||||
from kotaemon.indexing.doc_parsers import LIDocParser as DocParser
|
from kotaemon.indices.extractors import BaseDocParser
|
||||||
from kotaemon.indexing.doc_parsers import TokenSplitter
|
from kotaemon.indices.rankings import BaseReranking
|
||||||
|
from kotaemon.indices.splitters import TokenSplitter
|
||||||
from kotaemon.loaders import (
|
from kotaemon.loaders import (
|
||||||
AutoReader,
|
AutoReader,
|
||||||
DirectoryReader,
|
DirectoryReader,
|
||||||
|
@ -19,7 +22,6 @@ from kotaemon.loaders import (
|
||||||
)
|
)
|
||||||
from kotaemon.pipelines.agents import BaseAgent
|
from kotaemon.pipelines.agents import BaseAgent
|
||||||
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
|
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
|
||||||
from kotaemon.pipelines.reranking import BaseRerankingPipeline
|
|
||||||
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
|
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
|
||||||
from kotaemon.storages import (
|
from kotaemon.storages import (
|
||||||
BaseDocumentStore,
|
BaseDocumentStore,
|
||||||
|
@ -45,7 +47,7 @@ class ReaderIndexingPipeline(BaseComponent):
|
||||||
chunk_overlap: int = 256
|
chunk_overlap: int = 256
|
||||||
vector_store: BaseVectorStore = _(InMemoryVectorStore)
|
vector_store: BaseVectorStore = _(InMemoryVectorStore)
|
||||||
doc_store: BaseDocumentStore = _(InMemoryDocumentStore)
|
doc_store: BaseDocumentStore = _(InMemoryDocumentStore)
|
||||||
doc_parsers: List[DocParser] = []
|
doc_parsers: list[BaseDocParser] = []
|
||||||
|
|
||||||
embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
|
embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
|
||||||
model="text-embedding-ada-002",
|
model="text-embedding-ada-002",
|
||||||
|
@ -55,9 +57,9 @@ class ReaderIndexingPipeline(BaseComponent):
|
||||||
chunk_size=16,
|
chunk_size=16,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_reader(self, input_files: List[Union[str, Path]]):
|
def get_reader(self, input_files: list[str | Path]):
|
||||||
# document parsers
|
# document parsers
|
||||||
file_extractor: Dict[str, BaseReader] = {
|
file_extractor: dict[str, BaseReader | AutoReader] = {
|
||||||
".xlsx": PandasExcelReader(),
|
".xlsx": PandasExcelReader(),
|
||||||
}
|
}
|
||||||
if self.reader_name == "normal":
|
if self.reader_name == "normal":
|
||||||
|
@ -89,7 +91,7 @@ class ReaderIndexingPipeline(BaseComponent):
|
||||||
|
|
||||||
def run(
|
def run(
|
||||||
self,
|
self,
|
||||||
file_path_list: Union[List[Union[str, Path]], Union[str, Path]],
|
file_path_list: list[str | Path] | str | Path,
|
||||||
force_reindex: Optional[bool] = False,
|
force_reindex: Optional[bool] = False,
|
||||||
):
|
):
|
||||||
self.storage_path.mkdir(exist_ok=True)
|
self.storage_path.mkdir(exist_ok=True)
|
||||||
|
@ -121,9 +123,7 @@ class ReaderIndexingPipeline(BaseComponent):
|
||||||
else:
|
else:
|
||||||
self.indexing_vector_pipeline.load(file_storage_path)
|
self.indexing_vector_pipeline.load(file_storage_path)
|
||||||
|
|
||||||
def to_retrieving_pipeline(
|
def to_retrieving_pipeline(self, top_k=3, rerankers: Sequence[BaseReranking] = []):
|
||||||
self, top_k=3, rerankers: Sequence[BaseRerankingPipeline] = []
|
|
||||||
):
|
|
||||||
retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
|
retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
|
||||||
vector_store=self.vector_store,
|
vector_store=self.vector_store,
|
||||||
doc_store=self.doc_store,
|
doc_store=self.doc_store,
|
||||||
|
@ -141,7 +141,7 @@ class ReaderIndexingPipeline(BaseComponent):
|
||||||
doc_store=self.doc_store,
|
doc_store=self.doc_store,
|
||||||
embedding=self.embedding,
|
embedding=self.embedding,
|
||||||
llm=llm,
|
llm=llm,
|
||||||
**kwargs
|
**kwargs,
|
||||||
)
|
)
|
||||||
return qa_pipeline
|
return qa_pipeline
|
||||||
|
|
||||||
|
@ -153,7 +153,7 @@ class ReaderIndexingPipeline(BaseComponent):
|
||||||
doc_store=self.doc_store,
|
doc_store=self.doc_store,
|
||||||
embedding=self.embedding,
|
embedding=self.embedding,
|
||||||
agent=agent,
|
agent=agent,
|
||||||
**kwargs
|
**kwargs,
|
||||||
)
|
)
|
||||||
agent_pipeline.add_search_tool()
|
agent_pipeline.add_search_tool()
|
||||||
return agent_pipeline
|
return agent_pipeline
|
||||||
|
|
|
@ -8,11 +8,11 @@ from theflow.utils.modules import ObjectInitDeclaration as _
|
||||||
from kotaemon.base import BaseComponent
|
from kotaemon.base import BaseComponent
|
||||||
from kotaemon.base.schema import Document, RetrievedDocument
|
from kotaemon.base.schema import Document, RetrievedDocument
|
||||||
from kotaemon.embeddings import AzureOpenAIEmbeddings
|
from kotaemon.embeddings import AzureOpenAIEmbeddings
|
||||||
|
from kotaemon.indices.rankings import BaseReranking
|
||||||
from kotaemon.llms import PromptTemplate
|
from kotaemon.llms import PromptTemplate
|
||||||
from kotaemon.llms.chats.openai import AzureChatOpenAI
|
from kotaemon.llms.chats.openai import AzureChatOpenAI
|
||||||
from kotaemon.pipelines.agents import BaseAgent
|
from kotaemon.pipelines.agents import BaseAgent
|
||||||
from kotaemon.pipelines.citation import CitationPipeline
|
from kotaemon.pipelines.citation import CitationPipeline
|
||||||
from kotaemon.pipelines.reranking import BaseRerankingPipeline
|
|
||||||
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
|
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
|
||||||
from kotaemon.pipelines.tools import ComponentTool
|
from kotaemon.pipelines.tools import ComponentTool
|
||||||
from kotaemon.storages import (
|
from kotaemon.storages import (
|
||||||
|
@ -51,7 +51,7 @@ class QuestionAnsweringPipeline(BaseComponent):
|
||||||
|
|
||||||
vector_store: BaseVectorStore = _(InMemoryVectorStore)
|
vector_store: BaseVectorStore = _(InMemoryVectorStore)
|
||||||
doc_store: BaseDocumentStore = _(InMemoryDocumentStore)
|
doc_store: BaseDocumentStore = _(InMemoryDocumentStore)
|
||||||
rerankers: Sequence[BaseRerankingPipeline] = []
|
rerankers: Sequence[BaseReranking] = []
|
||||||
|
|
||||||
embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
|
embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
|
||||||
model="text-embedding-ada-002",
|
model="text-embedding-ada-002",
|
||||||
|
|
|
@ -3,11 +3,12 @@ from __future__ import annotations
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Sequence
|
from typing import Optional, Sequence
|
||||||
|
|
||||||
|
from kotaemon.indices.rankings import BaseReranking
|
||||||
|
|
||||||
from ..base import BaseComponent
|
from ..base import BaseComponent
|
||||||
from ..base.schema import Document, RetrievedDocument
|
from ..base.schema import Document, RetrievedDocument
|
||||||
from ..embeddings import BaseEmbeddings
|
from ..embeddings import BaseEmbeddings
|
||||||
from ..storages import BaseDocumentStore, BaseVectorStore
|
from ..storages import BaseDocumentStore, BaseVectorStore
|
||||||
from .reranking import BaseRerankingPipeline
|
|
||||||
|
|
||||||
VECTOR_STORE_FNAME = "vectorstore"
|
VECTOR_STORE_FNAME = "vectorstore"
|
||||||
DOC_STORE_FNAME = "docstore"
|
DOC_STORE_FNAME = "docstore"
|
||||||
|
@ -19,7 +20,7 @@ class RetrieveDocumentFromVectorStorePipeline(BaseComponent):
|
||||||
vector_store: BaseVectorStore
|
vector_store: BaseVectorStore
|
||||||
doc_store: BaseDocumentStore
|
doc_store: BaseDocumentStore
|
||||||
embedding: BaseEmbeddings
|
embedding: BaseEmbeddings
|
||||||
rerankers: Sequence[BaseRerankingPipeline] = []
|
rerankers: Sequence[BaseReranking] = []
|
||||||
top_k: int = 1
|
top_k: int = 1
|
||||||
# TODO: refer to llama_index's storage as well
|
# TODO: refer to llama_index's storage as well
|
||||||
|
|
||||||
|
|
|
@ -4,8 +4,8 @@ import pytest
|
||||||
from openai.types.chat.chat_completion import ChatCompletion
|
from openai.types.chat.chat_completion import ChatCompletion
|
||||||
|
|
||||||
from kotaemon.base import Document
|
from kotaemon.base import Document
|
||||||
|
from kotaemon.indices.rankings import LLMReranking
|
||||||
from kotaemon.llms.chats.openai import AzureChatOpenAI
|
from kotaemon.llms.chats.openai import AzureChatOpenAI
|
||||||
from kotaemon.pipelines.reranking import LLMReranking
|
|
||||||
|
|
||||||
_openai_chat_completion_responses = [
|
_openai_chat_completion_responses = [
|
||||||
ChatCompletion.parse_obj(
|
ChatCompletion.parse_obj(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user