Fix loaders' file_path and other metadata

This commit is contained in:
trducng 2024-01-27 22:52:46 +07:00
parent c6637ca56e
commit 80ec214107
5 changed files with 22 additions and 24 deletions

View File

@@ -44,6 +44,7 @@ class PandasExcelReader(BaseReader):
file: Path,
include_sheetname: bool = False,
sheet_name: Optional[Union[str, int, list]] = None,
extra_info: Optional[dict] = None,
**kwargs,
) -> List[Document]:
"""Parse file and extract values from a specific column.
@@ -92,7 +93,7 @@ class PandasExcelReader(BaseReader):
text=self._row_joiner.join(
self._col_joiner.join(sublist) for sublist in text_list
),
metadata={"source": file.stem},
metadata=extra_info or {},
)
]

View File

@@ -2,7 +2,7 @@ import json
import re
import time
from pathlib import Path
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional
import requests
from kotaemon.base import Document
@@ -138,7 +138,9 @@ class MathpixPDFReader(BaseReader):
contents = re.sub(markup_regex, "", contents)
return contents
def load_data(self, file_path: Path, **kwargs) -> List[Document]:
def load_data(
self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
) -> List[Document]:
if "response_content" in kwargs:
# overriding response content if specified
content = kwargs["response_content"]
@@ -154,10 +156,11 @@
for table in tables:
text = strip_special_chars_markdown(table)
metadata = {
"source": file_path.name,
"table_origin": table,
"type": "table",
}
if extra_info:
metadata.update(extra_info)
documents.append(
Document(
text=text,

View File

@@ -1,5 +1,5 @@
from pathlib import Path
from typing import List
from typing import List, Optional
from uuid import uuid4
import requests
@@ -25,7 +25,9 @@ class OCRReader(BaseReader):
self.ocr_endpoint = endpoint
self.use_ocr = use_ocr
def load_data(self, file_path: Path, **kwargs) -> List[Document]:
def load_data(
self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
) -> List[Document]:
"""Load data using OCR reader
Args:
@@ -63,6 +65,7 @@
debug_path=debug_path,
artifact_path=artifact_path,
)
extra_info = extra_info or {}
# create output Document with metadata from table
documents = [
@@ -72,10 +75,7 @@
"table_origin": table_text,
"type": "table",
"page_label": page_id + 1,
"source": file_path.name,
"file_path": str(file_path),
"file_name": file_path.name,
"filename": str(file_path),
**extra_info,
},
metadata_template="",
metadata_seperator="",
@@ -87,13 +87,7 @@
[
Document(
text=non_table_text,
metadata={
"page_label": page_id + 1,
"source": file_path.name,
"file_path": str(file_path),
"file_name": file_path.name,
"filename": str(file_path),
},
metadata={"page_label": page_id + 1, **extra_info},
)
for page_id, non_table_text in texts
]

View File

@@ -50,7 +50,7 @@ class UnstructuredReader(BaseReader):
def load_data(
self,
file: Path,
additional_metadata: Optional[Dict] = None,
extra_info: Optional[Dict] = None,
split_documents: Optional[bool] = False,
**kwargs,
) -> List[Document]:
@@ -91,8 +91,8 @@
continue
metadata[field] = val
if additional_metadata is not None:
metadata.update(additional_metadata)
if extra_info is not None:
metadata.update(extra_info)
metadata["file_name"] = file_name
docs.append(Document(text=node.text, metadata=metadata))
@@ -101,8 +101,8 @@
text_chunks = [" ".join(str(el).split()) for el in elements]
metadata = {"file_name": file_name, "file_path": file_path}
if additional_metadata is not None:
metadata.update(additional_metadata)
if extra_info is not None:
metadata.update(extra_info)
# Create a single document by joining all the texts
docs.append(Document(text="\n\n".join(text_chunks), metadata=metadata))

View File

@@ -96,7 +96,7 @@ class PrepareEvidencePipeline(BaseComponent):
evidence = texts[0].text
print("len (trimmed)", len(evidence))
print(f"PrepareEvidence with input {input}\nOutput: {evidence}\n")
print(f"PrepareEvidence with input {docs}\nOutput: {evidence}\n")
return Document(content=(evidence_mode, evidence))
@@ -228,7 +228,7 @@ class FullQAPipeline(BaseComponent):
answering_pipeline: AnswerWithContextPipeline = AnswerWithContextPipeline.withx()
async def run( # type: ignore
self, message: str, cid: str, history: list, **kwargs # type: ignore
self, message: str, conv_id: str, history: list, **kwargs # type: ignore
) -> Document: # type: ignore
docs = []
for retriever in self.retrievers: