Reduce the default chunk size in the reasoning pipeline (chunk_size 7600 → 3000) to fit the LLM's capability
This commit is contained in:
parent
107bc7580e
commit
bff55230ba
|
@ -7,10 +7,12 @@ repos:
|
||||||
- id: check-toml
|
- id: check-toml
|
||||||
- id: end-of-file-fixer
|
- id: end-of-file-fixer
|
||||||
- id: trailing-whitespace
|
- id: trailing-whitespace
|
||||||
|
- id: mixed-line-ending
|
||||||
- id: detect-aws-credentials
|
- id: detect-aws-credentials
|
||||||
args: ["--allow-missing-credentials"]
|
args: ["--allow-missing-credentials"]
|
||||||
- id: detect-private-key
|
- id: detect-private-key
|
||||||
- id: check-added-large-files
|
- id: check-added-large-files
|
||||||
|
- id: debug-statements
|
||||||
- repo: https://github.com/ambv/black
|
- repo: https://github.com/ambv/black
|
||||||
rev: 22.3.0
|
rev: 22.3.0
|
||||||
hooks:
|
hooks:
|
||||||
|
|
|
@ -166,7 +166,7 @@ class DocumentRetrievalPipeline(BaseRetriever):
|
||||||
)
|
)
|
||||||
table_pages[doc.metadata["file_name"]].append(doc.metadata["page_label"])
|
table_pages[doc.metadata["file_name"]].append(doc.metadata["page_label"])
|
||||||
|
|
||||||
queries = [
|
queries: list[dict] = [
|
||||||
{"$and": [{"file_name": {"$eq": fn}}, {"page_label": {"$in": pls}}]}
|
{"$and": [{"file_name": {"$eq": fn}}, {"page_label": {"$in": pls}}]}
|
||||||
for fn, pls in table_pages.items()
|
for fn, pls in table_pages.items()
|
||||||
]
|
]
|
||||||
|
@ -174,7 +174,7 @@ class DocumentRetrievalPipeline(BaseRetriever):
|
||||||
extra_docs = self.vector_retrieval(
|
extra_docs = self.vector_retrieval(
|
||||||
text="",
|
text="",
|
||||||
top_k=50,
|
top_k=50,
|
||||||
where={"$or": queries},
|
where=queries[0] if len(queries) == 1 else {"$or": queries},
|
||||||
)
|
)
|
||||||
for doc in extra_docs:
|
for doc in extra_docs:
|
||||||
if doc.doc_id not in retrieved_id:
|
if doc.doc_id not in retrieved_id:
|
||||||
|
|
|
@ -6,7 +6,15 @@ class FileUpload(BasePage):
|
||||||
def __init__(self, app):
|
def __init__(self, app):
|
||||||
self._app = app
|
self._app = app
|
||||||
self._supported_file_types = [
|
self._supported_file_types = [
|
||||||
"image", ".pdf", ".txt", ".csv", ".xlsx", ".doc", ".docx", ".pptx", ".html"
|
"image",
|
||||||
|
".pdf",
|
||||||
|
".txt",
|
||||||
|
".csv",
|
||||||
|
".xlsx",
|
||||||
|
".doc",
|
||||||
|
".docx",
|
||||||
|
".pptx",
|
||||||
|
".html",
|
||||||
]
|
]
|
||||||
self.on_building_ui()
|
self.on_building_ui()
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@ class PrepareEvidencePipeline(BaseComponent):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
trim_func: TokenSplitter = TokenSplitter.withx(
|
trim_func: TokenSplitter = TokenSplitter.withx(
|
||||||
chunk_size=7600,
|
chunk_size=3000,
|
||||||
chunk_overlap=0,
|
chunk_overlap=0,
|
||||||
separator=" ",
|
separator=" ",
|
||||||
tokenizer=partial(
|
tokenizer=partial(
|
||||||
|
@ -232,8 +232,12 @@ class FullQAPipeline(BaseComponent):
|
||||||
self, message: str, conv_id: str, history: list, **kwargs # type: ignore
|
self, message: str, conv_id: str, history: list, **kwargs # type: ignore
|
||||||
) -> Document: # type: ignore
|
) -> Document: # type: ignore
|
||||||
docs = []
|
docs = []
|
||||||
|
doc_ids = []
|
||||||
for retriever in self.retrievers:
|
for retriever in self.retrievers:
|
||||||
docs.extend(retriever(text=message))
|
for doc in retriever(text=message):
|
||||||
|
if doc.doc_id not in doc_ids:
|
||||||
|
docs.append(doc)
|
||||||
|
doc_ids.append(doc.doc_id)
|
||||||
evidence_mode, evidence = self.evidence_pipeline(docs).content
|
evidence_mode, evidence = self.evidence_pipeline(docs).content
|
||||||
answer = await self.answering_pipeline(
|
answer = await self.answering_pipeline(
|
||||||
question=message, evidence=evidence, evidence_mode=evidence_mode
|
question=message, evidence=evidence, evidence_mode=evidence_mode
|
||||||
|
@ -287,7 +291,7 @@ class FullQAPipeline(BaseComponent):
|
||||||
|
|
||||||
if not_detected:
|
if not_detected:
|
||||||
self.report_output(
|
self.report_output(
|
||||||
{"evidence": "Retrieved docs without matching evidence:\n"}
|
{"evidence": "Retrieved segments without matching evidence:\n"}
|
||||||
)
|
)
|
||||||
for id in list(not_detected):
|
for id in list(not_detected):
|
||||||
self.report_output(
|
self.report_output(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user