Refactor the index component and update the MVP instance accordingly (#90)
Refactor the `kotaemon/pipelines` module into `kotaemon/indices` and create the VectorIndex. Note: `qa` currently sits inside `kotaemon/indices` because, at the moment, QA only appears in the RAG flow; it could equally become an independent `kotaemon/qa` module. Since that split is easy to make later, I am going with the first option for now and will revisit it if needed. A minimal usage sketch of the refactored components follows.
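A minimal sketch of the intended composition, assuming the in-memory stores and the Azure OpenAI embedding configuration used elsewhere in this commit (the sample texts and query are illustrative):

```python
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.indices import VectorIndexing
from kotaemon.storages import InMemoryDocumentStore, InMemoryVectorStore

# build the indexing pipeline; any BaseVectorStore/BaseDocumentStore/BaseEmbeddings works
indexing = VectorIndexing(
    vector_store=InMemoryVectorStore(),
    doc_store=InMemoryDocumentStore(),
    embedding=AzureOpenAIEmbeddings(
        model="text-embedding-ada-002",
        deployment="dummy-q2-text-embedding",
        azure_endpoint="https://bleh-dummy.openai.azure.com/",
        openai_api_key="...",  # placeholder
    ),
)
indexing(["first passage", "second passage"])  # embed the texts and store them

# derive a retrieval pipeline sharing the same stores and embedding
retrieval = indexing.to_retrieval_pipeline(top_k=3)
docs = retrieval("a query about the passages")  # -> list[RetrievedDocument]
```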
committed by GitHub (parent 8e3a1d193f, commit e34b1e4c6d)
@@ -6,8 +6,8 @@ from typing import Any
 from theflow import Param

 from kotaemon.base.schema import Document
+from kotaemon.indices.qa import CitationPipeline
 from kotaemon.llms import LLM, ChatLLM, PromptTemplate
-from kotaemon.pipelines.citation import CitationPipeline

 from ..base import AgentType, BaseAgent, BaseLLM, BaseTool
 from ..output.base import BaseScratchPad
@@ -0,0 +1,3 @@
+from .vectorindex import VectorIndexing, VectorRetrieval
+
+__all__ = ["VectorIndexing", "VectorRetrieval"]
@@ -5,7 +5,7 @@ from typing import Any, Type
 from llama_index.node_parser.interface import NodeParser

-from ..base import BaseComponent, Document
+from kotaemon.base import BaseComponent, Document, RetrievedDocument


 class DocTransformer(BaseComponent):
@@ -26,7 +26,7 @@ class DocTransformer(BaseComponent):
         ...


-class LlamaIndexMixin:
+class LlamaIndexDocTransformerMixin:
     """Allow automatically wrapping a Llama-index component into kotaemon component

     Example:
@@ -70,3 +70,23 @@ class LlamaIndexMixin:
         """
         docs = self._obj(documents, **kwargs)  # type: ignore
         return [Document.from_dict(doc.to_dict()) for doc in docs]
+
+
+class BaseIndexing(BaseComponent):
+    """Define the base interface for indexing pipeline"""
+
+    def to_retrieval_pipeline(self, **kwargs):
+        """Convert the indexing pipeline to a retrieval pipeline"""
+        raise NotImplementedError
+
+    def to_qa_pipeline(self, **kwargs):
+        """Convert the indexing pipeline to a QA pipeline"""
+        raise NotImplementedError
+
+
+class BaseRetrieval(BaseComponent):
+    """Define the base interface for retrieval pipeline"""
+
+    @abstractmethod
+    def run(self, *args, **kwargs) -> list[RetrievedDocument]:
+        ...
@@ -1,18 +1,18 @@
-from ..base import DocTransformer, LlamaIndexMixin
+from ..base import DocTransformer, LlamaIndexDocTransformerMixin


 class BaseDocParser(DocTransformer):
     ...


-class TitleExtractor(LlamaIndexMixin, BaseDocParser):
+class TitleExtractor(LlamaIndexDocTransformerMixin, BaseDocParser):
     def _get_li_class(self):
         from llama_index.extractors import TitleExtractor

         return TitleExtractor


-class SummaryExtractor(LlamaIndexMixin, BaseDocParser):
+class SummaryExtractor(LlamaIndexDocTransformerMixin, BaseDocParser):
     def _get_li_class(self):
         from llama_index.extractors import SummaryExtractor
knowledgehub/indices/ingests/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
+from .files import DocumentIngestor
+
+__all__ = ["DocumentIngestor"]
knowledgehub/indices/ingests/files.py (new file, 75 lines)
@@ -0,0 +1,75 @@
+from pathlib import Path
+
+from llama_index.readers.base import BaseReader
+from theflow import Param
+
+from kotaemon.base import BaseComponent, Document
+from kotaemon.indices.extractors import BaseDocParser
+from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
+from kotaemon.loaders import (
+    AutoReader,
+    DirectoryReader,
+    MathpixPDFReader,
+    OCRReader,
+    PandasExcelReader,
+)
+
+
+class DocumentIngestor(BaseComponent):
+    """Ingest common office document types into Document for indexing
+
+    Document types:
+        - pdf
+        - xlsx
+        - docx
+    """
+
+    pdf_mode: str = "normal"  # "normal", "mathpix", "ocr"
+    doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])
+    text_splitter: BaseSplitter = TokenSplitter.withx(
+        chunk_size=1024,
+        chunk_overlap=256,
+    )
+
+    def _get_reader(self, input_files: list[str | Path]):
+        """Get appropriate readers for the input files based on file extension"""
+        file_extractor: dict[str, AutoReader | BaseReader] = {
+            ".xlsx": PandasExcelReader(),
+        }
+
+        if self.pdf_mode == "normal":
+            file_extractor[".pdf"] = AutoReader("UnstructuredReader")
+        elif self.pdf_mode == "ocr":
+            file_extractor[".pdf"] = OCRReader()
+        else:
+            file_extractor[".pdf"] = MathpixPDFReader()
+
+        main_reader = DirectoryReader(
+            input_files=input_files,
+            file_extractor=file_extractor,
+        )
+
+        return main_reader
+
+    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:
+        """Ingest the file paths into Document
+
+        Args:
+            file_paths: list of file paths or a single file path
+
+        Returns:
+            list of parsed Documents
+        """
+        if not isinstance(file_paths, list):
+            file_paths = [file_paths]
+
+        documents = self._get_reader(input_files=file_paths)()
+        nodes = self.text_splitter(documents)
+        self.log_progress(".num_docs", num_docs=len(nodes))
+
+        # document parsers call
+        if self.doc_parsers:
+            for parser in self.doc_parsers:
+                nodes = parser(nodes)
+
+        return nodes
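For reference, a minimal usage sketch of `DocumentIngestor` as added above (the file names are hypothetical; note that any `pdf_mode` value other than "normal" or "ocr" falls through to the Mathpix reader):

```python
from kotaemon.indices.ingests import DocumentIngestor

ingestor = DocumentIngestor(pdf_mode="ocr")       # one of "normal", "mathpix", "ocr"
nodes = ingestor(["report.pdf", "figures.xlsx"])  # hypothetical input files
# nodes: list[Document], already chunked by the default TokenSplitter (1024/256)
```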
knowledgehub/indices/qa/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
+from .citation import CitationPipeline
+from .text_based import CitationQAPipeline
+
+__all__ = [
+    "CitationPipeline",
+    "CitationQAPipeline",
+]
@@ -1,14 +1,10 @@
-from typing import Iterator, List, Union
+from typing import Iterator, List

 from pydantic import BaseModel, Field

 from kotaemon.base import BaseComponent
 from kotaemon.base.schema import HumanMessage, SystemMessage
-
-from ..llms.chats.base import ChatLLM
-from ..llms.completions.base import LLM
-
-BaseLLM = Union[ChatLLM, LLM]
+from kotaemon.llms import BaseLLM


 class FactWithEvidence(BaseModel):
knowledgehub/indices/qa/text_based.py (new file, 62 lines)
@@ -0,0 +1,62 @@
+import os
+
+from kotaemon.base import BaseComponent, Document, RetrievedDocument
+from kotaemon.llms import AzureChatOpenAI, BaseLLM, PromptTemplate
+
+from .citation import CitationPipeline
+
+
+class CitationQAPipeline(BaseComponent):
+    """Answering question from a text corpus with citation"""
+
+    qa_prompt_template: PromptTemplate = PromptTemplate(
+        'Answer the following question: "{question}". '
+        "The context is: \n{context}\nAnswer: "
+    )
+    llm: BaseLLM = AzureChatOpenAI.withx(
+        azure_endpoint="https://bleh-dummy.openai.azure.com/",
+        openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
+        openai_api_version="2023-07-01-preview",
+        deployment_name="dummy-q2-16k",
+        temperature=0,
+        request_timeout=60,
+    )
+
+    def _format_doc_text(self, text: str) -> str:
+        """Format the text of each document"""
+        return text.replace("\n", " ")
+
+    def _format_retrieved_context(self, documents: list[RetrievedDocument]) -> str:
+        """Format the texts between all documents"""
+        matched_texts: list[str] = [
+            self._format_doc_text(doc.text) for doc in documents
+        ]
+        return "\n\n".join(matched_texts)
+
+    def run(
+        self,
+        question: str,
+        documents: list[RetrievedDocument],
+        use_citation: bool = False,
+        **kwargs
+    ) -> Document:
+        # retrieve relevant documents as context
+        context = self._format_retrieved_context(documents)
+        self.log_progress(".context", context=context)
+
+        # generate the answer
+        prompt = self.qa_prompt_template.populate(
+            context=context,
+            question=question,
+        )
+        self.log_progress(".prompt", prompt=prompt)
+        answer_text = self.llm(prompt).text
+        if use_citation:
+            # run citation pipeline
+            citation_pipeline = CitationPipeline(llm=self.llm)
+            citation = citation_pipeline(context=context, question=question)
+        else:
+            citation = None
+
+        answer = Document(text=answer_text, metadata={"citation": citation})
+        return answer
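A sketch of how `CitationQAPipeline` above is meant to be called, assuming `docs` is a `list[RetrievedDocument]` produced by a retrieval step (the question text is illustrative):

```python
from kotaemon.indices.qa import CitationQAPipeline

qa = CitationQAPipeline()  # or pass llm=... to override the Azure default above
answer = qa(question="What is the main finding?", documents=docs, use_citation=True)
print(answer.text)
print(answer.metadata["citation"])  # None when use_citation=False
```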
@@ -1,4 +1,4 @@
-from ..base import DocTransformer, LlamaIndexMixin
+from ..base import DocTransformer, LlamaIndexDocTransformerMixin


 class BaseSplitter(DocTransformer):
@@ -7,14 +7,14 @@ class BaseSplitter(DocTransformer):
     ...


-class TokenSplitter(LlamaIndexMixin, BaseSplitter):
+class TokenSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
     def _get_li_class(self):
         from llama_index.text_splitter import TokenTextSplitter

         return TokenTextSplitter


-class SentenceWindowSplitter(LlamaIndexMixin, BaseSplitter):
+class SentenceWindowSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
     def _get_li_class(self):
         from llama_index.node_parser import SentenceWindowNodeParser
knowledgehub/indices/vectorindex.py (new file, 185 lines)
@@ -0,0 +1,185 @@
+from __future__ import annotations
+
+import uuid
+from pathlib import Path
+from typing import Optional, Sequence, cast
+
+from kotaemon.base import BaseComponent, Document, RetrievedDocument
+from kotaemon.embeddings import BaseEmbeddings
+from kotaemon.storages import BaseDocumentStore, BaseVectorStore
+
+from .base import BaseIndexing, BaseRetrieval
+from .rankings import BaseReranking
+
+VECTOR_STORE_FNAME = "vectorstore"
+DOC_STORE_FNAME = "docstore"
+
+
+class VectorIndexing(BaseIndexing):
+    """Ingest the document, run through the embedding, and store the embedding in a
+    vector store.
+
+    This pipeline supports the following set of inputs:
+        - List of documents
+        - List of texts
+    """
+
+    vector_store: BaseVectorStore
+    doc_store: Optional[BaseDocumentStore] = None
+    embedding: BaseEmbeddings
+
+    def to_retrieval_pipeline(self, *args, **kwargs):
+        """Convert the indexing pipeline to a retrieval pipeline"""
+        return VectorRetrieval(
+            vector_store=self.vector_store,
+            doc_store=self.doc_store,
+            embedding=self.embedding,
+            **kwargs,
+        )
+
+    def to_qa_pipeline(self, *args, **kwargs):
+        from .qa import CitationQAPipeline
+
+        return TextVectorQA(
+            retrieving_pipeline=self.to_retrieval_pipeline(**kwargs),
+            qa_pipeline=CitationQAPipeline(**kwargs),
+        )
+
+    def run(self, text: str | list[str] | Document | list[Document]) -> None:
+        input_: list[Document] = []
+        if not isinstance(text, list):
+            text = [text]
+
+        for item in cast(list, text):
+            if isinstance(item, str):
+                input_.append(Document(text=item, id_=str(uuid.uuid4())))
+            elif isinstance(item, Document):
+                input_.append(item)
+            else:
+                raise ValueError(
+                    f"Invalid input type {type(item)}, should be str or Document"
+                )
+
+        embeddings = self.embedding(input_)
+        self.vector_store.add(
+            embeddings=embeddings,
+            ids=[t.id_ for t in input_],
+        )
+        if self.doc_store:
+            self.doc_store.add(input_)
+
+    def save(
+        self,
+        path: str | Path,
+        vectorstore_fname: str = VECTOR_STORE_FNAME,
+        docstore_fname: str = DOC_STORE_FNAME,
+    ):
+        """Save the whole state of the indexing pipeline vector store and all
+        necessary information to disk
+
+        Args:
+            path (str): path to save the state
+        """
+        if isinstance(path, str):
+            path = Path(path)
+        self.vector_store.save(path / vectorstore_fname)
+        if self.doc_store:
+            self.doc_store.save(path / docstore_fname)
+
+    def load(
+        self,
+        path: str | Path,
+        vectorstore_fname: str = VECTOR_STORE_FNAME,
+        docstore_fname: str = DOC_STORE_FNAME,
+    ):
+        """Load all information from disk to an object"""
+        if isinstance(path, str):
+            path = Path(path)
+        self.vector_store.load(path / vectorstore_fname)
+        if self.doc_store:
+            self.doc_store.load(path / docstore_fname)
+
+
+class VectorRetrieval(BaseRetrieval):
+    """Retrieve list of documents from vector store"""
+
+    vector_store: BaseVectorStore
+    doc_store: Optional[BaseDocumentStore] = None
+    embedding: BaseEmbeddings
+    rerankers: Sequence[BaseReranking] = []
+    top_k: int = 1
+
+    def run(
+        self, text: str | Document, top_k: Optional[int] = None, **kwargs
+    ) -> list[RetrievedDocument]:
+        """Retrieve a list of documents from vector store
+
+        Args:
+            text: the text to retrieve similar documents
+            top_k: number of top similar documents to return
+
+        Returns:
+            list[RetrievedDocument]: list of retrieved documents
+        """
+        if top_k is None:
+            top_k = self.top_k
+
+        if self.doc_store is None:
+            raise ValueError(
+                "doc_store is not provided. Please provide a doc_store to "
+                "retrieve the documents"
+            )
+
+        emb: list[float] = self.embedding(text)[0].embedding
+        _, scores, ids = self.vector_store.query(embedding=emb, top_k=top_k)
+        docs = self.doc_store.get(ids)
+        result = [
+            RetrievedDocument(**doc.to_dict(), score=score)
+            for doc, score in zip(docs, scores)
+        ]
+        # use additional reranker to re-order the document list
+        if self.rerankers:
+            for reranker in self.rerankers:
+                result = reranker(documents=result, query=text)
+
+        return result
+
+    def save(
+        self,
+        path: str | Path,
+        vectorstore_fname: str = VECTOR_STORE_FNAME,
+        docstore_fname: str = DOC_STORE_FNAME,
+    ):
+        """Save the whole state of the indexing pipeline vector store and all
+        necessary information to disk
+
+        Args:
+            path (str): path to save the state
+        """
+        if isinstance(path, str):
+            path = Path(path)
+        self.vector_store.save(path / vectorstore_fname)
+        if self.doc_store:
+            self.doc_store.save(path / docstore_fname)
+
+    def load(
+        self,
+        path: str | Path,
+        vectorstore_fname: str = VECTOR_STORE_FNAME,
+        docstore_fname: str = DOC_STORE_FNAME,
+    ):
+        """Load all information from disk to an object"""
+        if isinstance(path, str):
+            path = Path(path)
+        self.vector_store.load(path / vectorstore_fname)
+        if self.doc_store:
+            self.doc_store.load(path / docstore_fname)
+
+
+class TextVectorQA(BaseComponent):
+    retrieving_pipeline: BaseRetrieval
+    qa_pipeline: BaseComponent
+
+    def run(self, question, **kwargs):
+        retrieved_documents = self.retrieving_pipeline(question, **kwargs)
+        return self.qa_pipeline(question, retrieved_documents, **kwargs)
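A sketch of the save/load contract above, reusing the `indexing` pipeline from the sketch in the description (the storage path is illustrative; the persisted files default to "vectorstore" and "docstore"):

```python
indexing.save("./storage/demo")  # persists the vector store (+ doc store if set)

retrieval = indexing.to_retrieval_pipeline(top_k=3)
retrieval.load("./storage/demo")  # restores both stores, e.g. in a fresh process
docs = retrieval("what does the document say about indexing?")
```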
@@ -1,3 +1,5 @@
+from typing import Union
+
 from kotaemon.base.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage

 from .branching import GatedBranchingPipeline, SimpleBranchingPipeline
@@ -6,7 +8,11 @@ from .completions import LLM, AzureOpenAI, OpenAI
 from .linear import GatedLinearPipeline, SimpleLinearPipeline
 from .prompts import BasePromptComponent, PromptTemplate

+BaseLLM = Union[ChatLLM, LLM]
+
+
 __all__ = [
+    "BaseLLM",
     # chat-specific components
     "ChatLLM",
     "BaseMessage",
@@ -4,8 +4,10 @@ from typing import Callable, List
 from theflow import Function, Node, Param

 from kotaemon.base import BaseComponent, Document
-from kotaemon.llms import LLM, BasePromptComponent
-from kotaemon.llms.chats.openai import AzureChatOpenAI
+
+from .chats import AzureChatOpenAI
+from .completions import LLM
+from .prompts import BasePromptComponent


 class Thought(BaseComponent):
@@ -146,7 +148,7 @@ class ManualSequentialChainOfThought(BaseComponent):
     thoughts: List[Thought] = Param(
         default_callback=lambda *_: [], help="List of Thought"
     )
-    llm: LLM = Param(help="The LLM model to use (base of kotaemon.llms.LLM)")
+    llm: LLM = Param(help="The LLM model to use (base of kotaemon.llms.BaseLLM)")
     terminate: Callable = Param(
         default=lambda _: False,
         help="Callback on terminate condition. Default to always return False",
@@ -1,81 +0,0 @@
-from __future__ import annotations
-
-import uuid
-from pathlib import Path
-from typing import Optional, cast
-
-from ..base import BaseComponent, Document
-from ..embeddings import BaseEmbeddings
-from ..storages import BaseDocumentStore, BaseVectorStore
-
-VECTOR_STORE_FNAME = "vectorstore"
-DOC_STORE_FNAME = "docstore"
-
-
-class IndexVectorStoreFromDocumentPipeline(BaseComponent):
-    """Ingest the document, run through the embedding, and store the embedding in a
-    vector store.
-
-    This pipeline supports the following set of inputs:
-        - List of documents
-        - List of texts
-    """
-
-    vector_store: BaseVectorStore
-    doc_store: Optional[BaseDocumentStore] = None
-    embedding: BaseEmbeddings
-    # TODO: refer to llama_index's storage as well
-
-    def run(self, text: str | list[str] | Document | list[Document]) -> None:
-        input_: list[Document] = []
-        if not isinstance(text, list):
-            text = [text]
-
-        for item in cast(list, text):
-            if isinstance(item, str):
-                input_.append(Document(text=item, id_=str(uuid.uuid4())))
-            elif isinstance(item, Document):
-                input_.append(item)
-            else:
-                raise ValueError(
-                    f"Invalid input type {type(item)}, should be str or Document"
-                )
-
-        embeddings = self.embedding(input_)
-        self.vector_store.add(
-            embeddings=embeddings,
-            ids=[t.id_ for t in input_],
-        )
-        if self.doc_store:
-            self.doc_store.add(input_)
-
-    def save(
-        self,
-        path: str | Path,
-        vectorstore_fname: str = VECTOR_STORE_FNAME,
-        docstore_fname: str = DOC_STORE_FNAME,
-    ):
-        """Save the whole state of the indexing pipeline vector store and all
-        necessary information to disk
-
-        Args:
-            path (str): path to save the state
-        """
-        if isinstance(path, str):
-            path = Path(path)
-        self.vector_store.save(path / vectorstore_fname)
-        if self.doc_store:
-            self.doc_store.save(path / docstore_fname)
-
-    def load(
-        self,
-        path: str | Path,
-        vectorstore_fname: str = VECTOR_STORE_FNAME,
-        docstore_fname: str = DOC_STORE_FNAME,
-    ):
-        """Load all information from disk to an object"""
-        if isinstance(path, str):
-            path = Path(path)
-        self.vector_store.load(path / vectorstore_fname)
-        if self.doc_store:
-            self.doc_store.load(path / docstore_fname)
@@ -1,159 +0,0 @@
-from __future__ import annotations
-
-import os
-from pathlib import Path
-from typing import Optional, Sequence
-
-from llama_index.readers.base import BaseReader
-from theflow import Node
-from theflow.utils.modules import ObjectInitDeclaration as _
-
-from kotaemon.agents import BaseAgent
-from kotaemon.base import BaseComponent
-from kotaemon.embeddings import AzureOpenAIEmbeddings
-from kotaemon.indices.extractors import BaseDocParser
-from kotaemon.indices.rankings import BaseReranking
-from kotaemon.indices.splitters import TokenSplitter
-from kotaemon.loaders import (
-    AutoReader,
-    DirectoryReader,
-    MathpixPDFReader,
-    OCRReader,
-    PandasExcelReader,
-)
-from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
-from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
-from kotaemon.storages import (
-    BaseDocumentStore,
-    BaseVectorStore,
-    InMemoryDocumentStore,
-    InMemoryVectorStore,
-)
-
-from .qa import AgentQAPipeline, QuestionAnsweringPipeline
-from .utils import file_names_to_collection_name
-
-
-class ReaderIndexingPipeline(BaseComponent):
-    """
-    Indexing pipeline which takes input from list of files
-    and perform ingestion to vectorstore
-    """
-
-    # Expose variables for users to switch in prompt ui
-    storage_path: Path = Path("./storage")
-    reader_name: str = "normal"  # "normal", "mathpix" or "ocr"
-    chunk_size: int = 1024
-    chunk_overlap: int = 256
-    vector_store: BaseVectorStore = _(InMemoryVectorStore)
-    doc_store: BaseDocumentStore = _(InMemoryDocumentStore)
-    doc_parsers: list[BaseDocParser] = []
-
-    embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
-        model="text-embedding-ada-002",
-        deployment="dummy-q2-text-embedding",
-        azure_endpoint="https://bleh-dummy.openai.azure.com/",
-        openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
-        chunk_size=16,
-    )
-
-    def get_reader(self, input_files: list[str | Path]):
-        # document parsers
-        file_extractor: dict[str, BaseReader | AutoReader] = {
-            ".xlsx": PandasExcelReader(),
-        }
-        if self.reader_name == "normal":
-            file_extractor[".pdf"] = AutoReader("UnstructuredReader")
-        elif self.reader_name == "ocr":
-            file_extractor[".pdf"] = OCRReader()
-        else:
-            file_extractor[".pdf"] = MathpixPDFReader()
-        main_reader = DirectoryReader(
-            input_files=input_files,
-            file_extractor=file_extractor,
-        )
-        return main_reader
-
-    @Node.auto(depends_on=["doc_store", "vector_store", "embedding"])
-    def indexing_vector_pipeline(self):
-        return IndexVectorStoreFromDocumentPipeline(
-            doc_store=self.doc_store,
-            vector_store=self.vector_store,
-            embedding=self.embedding,
-        )
-
-    @Node.auto(depends_on=["chunk_size", "chunk_overlap"])
-    def text_splitter(self) -> TokenSplitter:
-        return TokenSplitter(
-            chunk_size=self.chunk_size,
-            chunk_overlap=self.chunk_overlap,
-        )
-
-    def run(
-        self,
-        file_path_list: list[str | Path] | str | Path,
-        force_reindex: Optional[bool] = False,
-    ):
-        self.storage_path.mkdir(exist_ok=True)
-
-        if not isinstance(file_path_list, list):
-            file_path_list = [file_path_list]
-
-        self.file_name_list = [Path(path).stem for path in file_path_list]
-        collection_name = file_names_to_collection_name(self.file_name_list)
-
-        file_storage_path = self.storage_path / collection_name
-
-        # skip indexing if storage path exist
-        if force_reindex or not file_storage_path.exists():
-            file_storage_path.mkdir(exist_ok=True)
-            # reader call
-            documents = self.get_reader(input_files=file_path_list)()
-            nodes = self.text_splitter(documents)
-            self.log_progress(".num_docs", num_docs=len(nodes))
-
-            # document parsers call
-            if self.doc_parsers:
-                for parser in self.doc_parsers:
-                    nodes = parser(nodes)
-
-            self.indexing_vector_pipeline(nodes)
-            # persist right after indexing
-            self.indexing_vector_pipeline.save(file_storage_path)
-        else:
-            self.indexing_vector_pipeline.load(file_storage_path)
-
-    def to_retrieving_pipeline(self, top_k=3, rerankers: Sequence[BaseReranking] = []):
-        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
-            vector_store=self.vector_store,
-            doc_store=self.doc_store,
-            embedding=self.embedding,
-            top_k=top_k,
-            rerankers=rerankers,
-        )
-        return retrieving_pipeline
-
-    def to_qa_pipeline(self, llm: BaseComponent, **kwargs):
-        qa_pipeline = QuestionAnsweringPipeline(
-            storage_path=self.storage_path,
-            file_name_list=self.file_name_list,
-            vector_store=self.vector_store,
-            doc_store=self.doc_store,
-            embedding=self.embedding,
-            llm=llm,
-            **kwargs,
-        )
-        return qa_pipeline
-
-    def to_agent_pipeline(self, agent: BaseAgent, **kwargs):
-        agent_pipeline = AgentQAPipeline(
-            storage_path=self.storage_path,
-            file_name_list=self.file_name_list,
-            vector_store=self.vector_store,
-            doc_store=self.doc_store,
-            embedding=self.embedding,
-            agent=agent,
-            **kwargs,
-        )
-        agent_pipeline.add_search_tool()
-        return agent_pipeline
@@ -1,145 +0,0 @@
-import os
-from pathlib import Path
-from typing import List, Sequence
-
-from theflow import Node
-from theflow.utils.modules import ObjectInitDeclaration as _
-
-from kotaemon.agents import BaseAgent
-from kotaemon.agents.tools import ComponentTool
-from kotaemon.base import BaseComponent
-from kotaemon.base.schema import Document, RetrievedDocument
-from kotaemon.embeddings import AzureOpenAIEmbeddings
-from kotaemon.indices.rankings import BaseReranking
-from kotaemon.llms import PromptTemplate
-from kotaemon.llms.chats.openai import AzureChatOpenAI
-from kotaemon.pipelines.citation import CitationPipeline
-from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
-from kotaemon.storages import (
-    BaseDocumentStore,
-    BaseVectorStore,
-    InMemoryDocumentStore,
-    InMemoryVectorStore,
-)
-
-from .utils import file_names_to_collection_name
-
-
-class QuestionAnsweringPipeline(BaseComponent):
-    """
-    Question Answering pipeline ultilizing a child Retrieving pipeline
-    """
-
-    storage_path: Path = Path("./storage")
-    retrieval_top_k: int = 3
-    file_name_list: List[str]
-    """List of filename, incombination with storage_path to
-    create persistent path of vectorstore"""
-    qa_prompt_template: PromptTemplate = PromptTemplate(
-        'Answer the following question: "{question}". '
-        "The context is: \n{context}\nAnswer: "
-    )
-
-    llm: AzureChatOpenAI = AzureChatOpenAI.withx(
-        azure_endpoint="https://bleh-dummy.openai.azure.com/",
-        openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
-        openai_api_version="2023-07-01-preview",
-        deployment_name="dummy-q2-16k",
-        temperature=0,
-        request_timeout=60,
-    )
-
-    vector_store: BaseVectorStore = _(InMemoryVectorStore)
-    doc_store: BaseDocumentStore = _(InMemoryDocumentStore)
-    rerankers: Sequence[BaseReranking] = []
-
-    embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
-        model="text-embedding-ada-002",
-        deployment="dummy-q2-text-embedding",
-        azure_endpoint="https://bleh-dummy.openai.azure.com/",
-        openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
-    )
-
-    @Node.auto(
-        depends_on=[
-            "vector_store",
-            "doc_store",
-            "embedding",
-            "file_name_list",
-            "retrieval_top_k",
-        ]
-    )
-    def retrieving_pipeline(self) -> RetrieveDocumentFromVectorStorePipeline:
-        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
-            vector_store=self.vector_store,
-            doc_store=self.doc_store,
-            embedding=self.embedding,
-            top_k=self.retrieval_top_k,
-            rerankers=self.rerankers,
-        )
-        # load persistent from selected path
-        collection_name = file_names_to_collection_name(self.file_name_list)
-        retrieving_pipeline.load(self.storage_path / collection_name)
-        return retrieving_pipeline
-
-    def _format_doc_text(self, text: str) -> str:
-        return text.replace("\n", " ")
-
-    def _format_retrieved_context(self, documents: List[RetrievedDocument]) -> str:
-        matched_texts: List[str] = [
-            self._format_doc_text(doc.text) for doc in documents
-        ]
-        return "\n\n".join(matched_texts)
-
-    def run(self, question: str, use_citation: bool = False) -> Document:
-        # retrieve relevant documents as context
-        documents = self.retrieving_pipeline(question, top_k=int(self.retrieval_top_k))
-        context = self._format_retrieved_context(documents)
-        self.log_progress(".context", context=context)
-
-        # generate the answer
-        prompt = self.qa_prompt_template.populate(
-            context=context,
-            question=question,
-        )
-        self.log_progress(".prompt", prompt=prompt)
-        answer_text = self.llm(prompt).text
-        if use_citation:
-            # run citation pipeline
-            citation_pipeline = CitationPipeline(llm=self.llm)
-            citation = citation_pipeline(context=context, question=question)
-        else:
-            citation = None
-
-        answer = Document(text=answer_text, metadata={"citation": citation})
-        return answer
-
-
-class AgentQAPipeline(QuestionAnsweringPipeline):
-    """
-    QA pipeline ultilizing a child Retrieving pipeline and a Agent pipeline
-    """
-
-    agent: BaseAgent
-
-    def add_search_tool(self):
-        search_tool = ComponentTool(
-            name="search_doc",
-            description=(
-                "A vector store that searches for similar and "
-                "related content "
-                f"in a document: {' '.join(self.file_name_list)}. "
-                "The result is a huge chunk of text related "
-                "to your search but can also "
-                "contain irrelevant info."
-            ),
-            postprocessor=self._format_retrieved_context,
-            component=self.retrieving_pipeline,
-        )
-        if search_tool not in self.agent.plugins:
-            self.agent.add_tools([search_tool])
-
-    def run(self, question: str, use_citation: bool = False) -> Document:
-        kwargs = {"use_citation": use_citation} if use_citation else {}
-        answer = self.agent(question, **kwargs)
-        return answer
@@ -1,87 +0,0 @@
-from __future__ import annotations
-
-from pathlib import Path
-from typing import Optional, Sequence
-
-from kotaemon.base import BaseComponent, Document, RetrievedDocument
-from kotaemon.embeddings import BaseEmbeddings
-from kotaemon.indices.rankings import BaseReranking
-from kotaemon.storages import BaseDocumentStore, BaseVectorStore
-
-VECTOR_STORE_FNAME = "vectorstore"
-DOC_STORE_FNAME = "docstore"
-
-
-class RetrieveDocumentFromVectorStorePipeline(BaseComponent):
-    """Retrieve list of documents from vector store"""
-
-    vector_store: BaseVectorStore
-    doc_store: BaseDocumentStore
-    embedding: BaseEmbeddings
-    rerankers: Sequence[BaseReranking] = []
-    top_k: int = 1
-    # TODO: refer to llama_index's storage as well
-
-    def run(
-        self, text: str | Document, top_k: Optional[int] = None
-    ) -> list[RetrievedDocument]:
-        """Retrieve a list of documents from vector store
-
-        Args:
-            text: the text to retrieve similar documents
-            top_k: number of top similar documents to return
-
-        Returns:
-            list[RetrievedDocument]: list of retrieved documents
-        """
-        if top_k is None:
-            top_k = self.top_k
-
-        if self.doc_store is None:
-            raise ValueError(
-                "doc_store is not provided. Please provide a doc_store to "
-                "retrieve the documents"
-            )
-
-        emb: list[float] = self.embedding(text)[0].embedding
-        _, scores, ids = self.vector_store.query(embedding=emb, top_k=top_k)
-        docs = self.doc_store.get(ids)
-        result = [
-            RetrievedDocument(**doc.to_dict(), score=score)
-            for doc, score in zip(docs, scores)
-        ]
-        # use additional reranker to re-order the document list
-        if self.rerankers:
-            for reranker in self.rerankers:
-                result = reranker(documents=result, query=text)
-
-        return result
-
-    def save(
-        self,
-        path: str | Path,
-        vectorstore_fname: str = VECTOR_STORE_FNAME,
-        docstore_fname: str = DOC_STORE_FNAME,
-    ):
-        """Save the whole state of the indexing pipeline vector store and all
-        necessary information to disk
-
-        Args:
-            path (str): path to save the state
-        """
-        if isinstance(path, str):
-            path = Path(path)
-        self.vector_store.save(path / vectorstore_fname)
-        self.doc_store.save(path / docstore_fname)
-
-    def load(
-        self,
-        path: str | Path,
-        vectorstore_fname: str = VECTOR_STORE_FNAME,
-        docstore_fname: str = DOC_STORE_FNAME,
-    ):
-        """Load all information from disk to an object"""
-        if isinstance(path, str):
-            path = Path(path)
-        self.vector_store.load(path / vectorstore_fname)
-        self.doc_store.load(path / docstore_fname)
@@ -1,17 +0,0 @@
-import hashlib
-from typing import List
-
-
-def filename_to_hash(filename: str) -> str:
-    """
-    Convert filename to hash to be used as collection name for storage
-    """
-    result = hashlib.md5(filename.encode())
-    return result.hexdigest()
-
-
-def file_names_to_collection_name(file_name_list: List[str]) -> str:
-    """
-    Convert list of filenames to collection name
-    """
-    return filename_to_hash(" ".join(file_name_list))