Reduce the default chunk size in the reasoning pipeline to fit LLM capability

This commit is contained in:
trducng 2024-02-03 09:38:50 +07:00
parent 107bc7580e
commit bff55230ba
4 changed files with 20 additions and 6 deletions

View File

@@ -7,10 +7,12 @@ repos:
- id: check-toml - id: check-toml
- id: end-of-file-fixer - id: end-of-file-fixer
- id: trailing-whitespace - id: trailing-whitespace
- id: mixed-line-ending
- id: detect-aws-credentials - id: detect-aws-credentials
args: ["--allow-missing-credentials"] args: ["--allow-missing-credentials"]
- id: detect-private-key - id: detect-private-key
- id: check-added-large-files - id: check-added-large-files
- id: debug-statements
- repo: https://github.com/ambv/black - repo: https://github.com/ambv/black
rev: 22.3.0 rev: 22.3.0
hooks: hooks:

View File

@@ -166,7 +166,7 @@ class DocumentRetrievalPipeline(BaseRetriever):
) )
table_pages[doc.metadata["file_name"]].append(doc.metadata["page_label"]) table_pages[doc.metadata["file_name"]].append(doc.metadata["page_label"])
queries = [ queries: list[dict] = [
{"$and": [{"file_name": {"$eq": fn}}, {"page_label": {"$in": pls}}]} {"$and": [{"file_name": {"$eq": fn}}, {"page_label": {"$in": pls}}]}
for fn, pls in table_pages.items() for fn, pls in table_pages.items()
] ]
@@ -174,7 +174,7 @@ class DocumentRetrievalPipeline(BaseRetriever):
extra_docs = self.vector_retrieval( extra_docs = self.vector_retrieval(
text="", text="",
top_k=50, top_k=50,
where={"$or": queries}, where=queries[0] if len(queries) == 1 else {"$or": queries},
) )
for doc in extra_docs: for doc in extra_docs:
if doc.doc_id not in retrieved_id: if doc.doc_id not in retrieved_id:

View File

@@ -6,7 +6,15 @@ class FileUpload(BasePage):
def __init__(self, app): def __init__(self, app):
self._app = app self._app = app
self._supported_file_types = [ self._supported_file_types = [
"image", ".pdf", ".txt", ".csv", ".xlsx", ".doc", ".docx", ".pptx", ".html" "image",
".pdf",
".txt",
".csv",
".xlsx",
".doc",
".docx",
".pptx",
".html",
] ]
self.on_building_ui() self.on_building_ui()

View File

@@ -33,7 +33,7 @@ class PrepareEvidencePipeline(BaseComponent):
""" """
trim_func: TokenSplitter = TokenSplitter.withx( trim_func: TokenSplitter = TokenSplitter.withx(
chunk_size=7600, chunk_size=3000,
chunk_overlap=0, chunk_overlap=0,
separator=" ", separator=" ",
tokenizer=partial( tokenizer=partial(
@@ -232,8 +232,12 @@ class FullQAPipeline(BaseComponent):
self, message: str, conv_id: str, history: list, **kwargs # type: ignore self, message: str, conv_id: str, history: list, **kwargs # type: ignore
) -> Document: # type: ignore ) -> Document: # type: ignore
docs = [] docs = []
doc_ids = []
for retriever in self.retrievers: for retriever in self.retrievers:
docs.extend(retriever(text=message)) for doc in retriever(text=message):
if doc.doc_id not in doc_ids:
docs.append(doc)
doc_ids.append(doc.doc_id)
evidence_mode, evidence = self.evidence_pipeline(docs).content evidence_mode, evidence = self.evidence_pipeline(docs).content
answer = await self.answering_pipeline( answer = await self.answering_pipeline(
question=message, evidence=evidence, evidence_mode=evidence_mode question=message, evidence=evidence, evidence_mode=evidence_mode
@@ -287,7 +291,7 @@ class FullQAPipeline(BaseComponent):
if not_detected: if not_detected:
self.report_output( self.report_output(
{"evidence": "Retrieved docs without matching evidence:\n"} {"evidence": "Retrieved segments without matching evidence:\n"}
) )
for id in list(not_detected): for id in list(not_detected):
self.report_output( self.report_output(