Fix loaders' file_path and other metadata
parent c6637ca56e
commit 80ec214107
@@ -44,6 +44,7 @@ class PandasExcelReader(BaseReader):
         file: Path,
         include_sheetname: bool = False,
         sheet_name: Optional[Union[str, int, list]] = None,
+        extra_info: Optional[dict] = None,
         **kwargs,
     ) -> List[Document]:
         """Parse file and extract values from a specific column.
@@ -92,7 +93,7 @@ class PandasExcelReader(BaseReader):
                 text=self._row_joiner.join(
                     self._col_joiner.join(sublist) for sublist in text_list
                 ),
-                metadata={"source": file.stem},
+                metadata=extra_info or {},
             )
         ]
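
Note: after this change the Excel reader no longer injects its own `{"source": file.stem}` default; whatever the caller passes as `extra_info` becomes the document metadata. A minimal usage sketch, assuming the default constructor and a hypothetical workbook path (the `file_name`/`file_path` keys are illustrative, not mandated by the reader):

    from pathlib import Path

    path = Path("report.xlsx")  # hypothetical input file
    reader = PandasExcelReader()
    docs = reader.load_data(
        path,
        extra_info={"file_name": path.name, "file_path": str(path)},
    )
    # every returned Document now carries exactly the metadata passed above
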
@@ -2,7 +2,7 @@ import json
 import re
 import time
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 import requests
 from kotaemon.base import Document
@@ -138,7 +138,9 @@ class MathpixPDFReader(BaseReader):
         contents = re.sub(markup_regex, "", contents)
         return contents
 
-    def load_data(self, file_path: Path, **kwargs) -> List[Document]:
+    def load_data(
+        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> List[Document]:
         if "response_content" in kwargs:
             # overriding response content if specified
             content = kwargs["response_content"]
@@ -154,10 +156,11 @@ class MathpixPDFReader(BaseReader):
         for table in tables:
             text = strip_special_chars_markdown(table)
             metadata = {
-                "source": file_path.name,
                 "table_origin": table,
                 "type": "table",
             }
+            if extra_info:
+                metadata.update(extra_info)
             documents.append(
                 Document(
                     text=text,
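
Note: the Mathpix reader keeps its own `table_origin` and `type` keys and layers `extra_info` on top with `dict.update`, so caller-supplied keys win on collision. A small sketch of that merge, with made-up values:

    metadata = {"table_origin": "| a | b |", "type": "table"}
    extra_info = {"file_name": "paper.pdf", "type": "table_markdown"}
    if extra_info:
        metadata.update(extra_info)
    # {'table_origin': '| a | b |', 'type': 'table_markdown', 'file_name': 'paper.pdf'}
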
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import List
+from typing import List, Optional
 from uuid import uuid4
 
 import requests
@@ -25,7 +25,9 @@ class OCRReader(BaseReader):
         self.ocr_endpoint = endpoint
         self.use_ocr = use_ocr
 
-    def load_data(self, file_path: Path, **kwargs) -> List[Document]:
+    def load_data(
+        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> List[Document]:
         """Load data using OCR reader
 
         Args:
@@ -63,6 +65,7 @@ class OCRReader(BaseReader):
             debug_path=debug_path,
             artifact_path=artifact_path,
         )
+        extra_info = extra_info or {}
 
         # create output Document with metadata from table
         documents = [
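
Note: defaulting with `extra_info = extra_info or {}` is what keeps the `**extra_info` unpacking in the hunks below safe when the caller passes nothing; unpacking `None` would raise a TypeError. A one-line illustration:

    extra_info = None
    extra_info = extra_info or {}               # {} instead of None
    metadata = {"page_label": 1, **extra_info}  # fine even when empty
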
@@ -72,10 +75,7 @@ class OCRReader(BaseReader):
                     "table_origin": table_text,
                     "type": "table",
                     "page_label": page_id + 1,
-                    "source": file_path.name,
-                    "file_path": str(file_path),
-                    "file_name": file_path.name,
-                    "filename": str(file_path),
+                    **extra_info,
                 },
                 metadata_template="",
                 metadata_seperator="",
@@ -87,13 +87,7 @@ class OCRReader(BaseReader):
             [
                 Document(
                     text=non_table_text,
-                    metadata={
-                        "page_label": page_id + 1,
-                        "source": file_path.name,
-                        "file_path": str(file_path),
-                        "file_name": file_path.name,
-                        "filename": str(file_path),
-                    },
+                    metadata={"page_label": page_id + 1, **extra_info},
                 )
                 for page_id, non_table_text in texts
             ]
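
Note: the hard-coded `source`/`file_path`/`file_name`/`filename` keys move out of the OCR reader entirely; a caller that still wants them now passes them explicitly. A sketch under that assumption (the endpoint URL and path are hypothetical; the `endpoint` constructor argument is inferred from `self.ocr_endpoint = endpoint` above):

    path = Path("scan.pdf")  # hypothetical input
    reader = OCRReader(endpoint="http://localhost:8000/ocr")  # hypothetical URL
    docs = reader.load_data(
        path,
        extra_info={"file_name": path.name, "file_path": str(path)},
    )
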
@@ -50,7 +50,7 @@ class UnstructuredReader(BaseReader):
     def load_data(
         self,
         file: Path,
-        additional_metadata: Optional[Dict] = None,
+        extra_info: Optional[Dict] = None,
         split_documents: Optional[bool] = False,
         **kwargs,
     ) -> List[Document]:
@@ -91,8 +91,8 @@ class UnstructuredReader(BaseReader):
                         continue
                     metadata[field] = val
 
-                if additional_metadata is not None:
-                    metadata.update(additional_metadata)
+                if extra_info is not None:
+                    metadata.update(extra_info)
 
                 metadata["file_name"] = file_name
                 docs.append(Document(text=node.text, metadata=metadata))
@@ -101,8 +101,8 @@ class UnstructuredReader(BaseReader):
             text_chunks = [" ".join(str(el).split()) for el in elements]
             metadata = {"file_name": file_name, "file_path": file_path}
 
-            if additional_metadata is not None:
-                metadata.update(additional_metadata)
+            if extra_info is not None:
+                metadata.update(extra_info)
 
             # Create a single document by joining all the texts
             docs.append(Document(text="\n\n".join(text_chunks), metadata=metadata))
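
Note: renaming `additional_metadata` to `extra_info` brings the UnstructuredReader in line with the other loaders, so one call site can drive any of them with the same keyword. A sketch of the unified call (path and metadata keys are illustrative):

    path = Path("slides.pptx")  # hypothetical input
    docs = UnstructuredReader().load_data(
        path,
        extra_info={"file_name": path.name, "file_path": str(path)},
        split_documents=True,
    )
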
@@ -96,7 +96,7 @@ class PrepareEvidencePipeline(BaseComponent):
             evidence = texts[0].text
             print("len (trimmed)", len(evidence))
 
-        print(f"PrepareEvidence with input {input}\nOutput: {evidence}\n")
+        print(f"PrepareEvidence with input {docs}\nOutput: {evidence}\n")
 
         return Document(content=(evidence_mode, evidence))
 
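
Note: assuming no local variable named `input`, the old f-string interpolated Python's builtin, so the debug line printed the function object rather than the pipeline's data:

    >>> print(f"PrepareEvidence with input {input}")
    PrepareEvidence with input <built-in function input>
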
@@ -228,7 +228,7 @@ class FullQAPipeline(BaseComponent):
     answering_pipeline: AnswerWithContextPipeline = AnswerWithContextPipeline.withx()
 
     async def run(  # type: ignore
-        self, message: str, cid: str, history: list, **kwargs  # type: ignore
+        self, message: str, conv_id: str, history: list, **kwargs  # type: ignore
     ) -> Document:  # type: ignore
         docs = []
         for retriever in self.retrievers:
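
Note: `cid` is renamed to the more descriptive `conv_id` (presumably "conversation id"), so callers passing it by keyword must update. A hypothetical call under that assumption:

    answer = await pipeline.run(
        message="What does the contract say about termination?",  # hypothetical
        conv_id="conv-123",  # hypothetical conversation id
        history=[],
    )
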