kotaemon/knowledgehub/pipelines/indexing.py
Tuan Anh Nguyen Dang (Tadashi_Cin) 56bc41b673 Update Base interface of Index/Retrieval pipeline (#36)
* add base Tool

* minor update test_tool

* update test dependency

* update test dependency

* Fix namespace conflict

* update test

* add base Agent Interface, add ReWoo Agent

* minor update

* update test

* fix typo

* remove unneeded print

* update rewoo agent

* add LLMTool

* update BaseAgent type

* add ReAct agent

* add ReAct agent

* minor update

* minor update

* minor update

* minor update

* update base reader with BaseComponent

* add splitter

* update agent and tool

* update vectorstores

* update load/save for indexing and retrieving pipeline

* update test_agent for more use-cases

* add missing dependency for test

* update test case for in memory vectorstore

* add TextSplitter to BaseComponent

* update type hint basetool

---------

Co-authored-by: trducng <trungduc1992@gmail.com>
2023-10-04 14:27:44 +07:00

92 lines
2.7 KiB
Python

import uuid
from pathlib import Path
from typing import List, Union
from theflow import Node, Param
from ..base import BaseComponent
from ..docstores import BaseDocumentStore
from ..documents.base import Document
from ..embeddings import BaseEmbeddings
from ..vectorstores import BaseVectorStore
# Default sub-path names used by save()/load() when persisting the pipeline's
# vector store and document store under a common directory.
VECTOR_STORE_FNAME = "vectorstore"
DOC_STORE_FNAME = "docstore"
class IndexVectorStoreFromDocumentPipeline(BaseComponent):
    """Ingest the document, run through the embedding, and store the embedding in a
    vector store.

    This pipeline supports the following set of inputs:
        - List of documents
        - List of texts
    """

    vector_store: Param[BaseVectorStore] = Param()
    # Optional: when unset/falsy, only embeddings are stored (see run_batch_document)
    doc_store: Param[BaseDocumentStore] = Param()
    embedding: Node[BaseEmbeddings] = Node()
    # TODO: refer to llama_index's storage as well

    def run_raw(self, text: str) -> None:
        """Embed and index a single raw text string.

        A fresh UUID is assigned as the document id.
        """
        document = Document(text=text, id_=str(uuid.uuid4()))
        self.run_batch_document([document])

    def run_batch_raw(self, text: List[str]) -> None:
        """Embed and index a batch of raw text strings.

        Each text gets its own freshly generated UUID as the document id.
        """
        documents = [Document(text=t, id_=str(uuid.uuid4())) for t in text]
        self.run_batch_document(documents)

    def run_document(self, text: Document) -> None:
        """Embed and index a single Document."""
        self.run_batch_document([text])

    def run_batch_document(self, text: List[Document]) -> None:
        """Embed a batch of Documents and persist them.

        Embeddings always go to the vector store; the raw documents are only
        stored when a doc_store has been configured.
        """
        embeddings = self.embedding(text)
        self.vector_store.add(
            embeddings=embeddings,
            ids=[t.id_ for t in text],
        )
        if self.doc_store:
            self.doc_store.add(text)

    def is_document(self, text) -> bool:
        """Return True if the input is a Document or a non-empty list of Documents.

        Only the first element of a list is inspected (assumes a homogeneous batch).
        """
        if isinstance(text, Document):
            return True
        # `text` must be non-empty before peeking at text[0]; the original code
        # raised IndexError on []. Use the builtin `list` (not typing.List) as
        # the isinstance target.
        if isinstance(text, list) and text and isinstance(text[0], Document):
            return True
        return False

    def is_batch(self, text) -> bool:
        """Return True if the input is a batch (i.e. a list)."""
        return isinstance(text, list)

    def save(
        self,
        path: Union[str, Path],
        vectorstore_fname: str = VECTOR_STORE_FNAME,
        docstore_fname: str = DOC_STORE_FNAME,
    ):
        """Save the whole state of the indexing pipeline vector store and all
        necessary information to disk.

        Args:
            path (str | Path): directory to save the state into
            vectorstore_fname (str): sub-path name for the vector store
            docstore_fname (str): sub-path name for the document store
        """
        if isinstance(path, str):
            path = Path(path)
        self.vector_store.save(path / vectorstore_fname)
        # doc_store is optional (consistent with run_batch_document)
        if self.doc_store:
            self.doc_store.save(path / docstore_fname)

    def load(
        self,
        path: Union[str, Path],
        vectorstore_fname: str = VECTOR_STORE_FNAME,
        docstore_fname: str = DOC_STORE_FNAME,
    ):
        """Load all information from disk into this object.

        Args:
            path (str | Path): directory the state was saved into
            vectorstore_fname (str): sub-path name for the vector store
            docstore_fname (str): sub-path name for the document store
        """
        if isinstance(path, str):
            path = Path(path)
        self.vector_store.load(path / vectorstore_fname)
        # doc_store is optional (consistent with run_batch_document)
        if self.doc_store:
            self.doc_store.load(path / docstore_fname)