Migrate the MVP into kotaemon (#108)
- Migrate the MVP into kotaemon. - Preliminarily include the pipeline within the chatbot interface. - Organize the MVP as an application. Todo: - Add an info panel to view the planning of agents -> Fix streaming of agents' output. Resolve: #60 Resolve: #61 Resolve: #62
This commit is contained in:
committed by
GitHub
parent
230328c62f
commit
5a9d6f75be
@@ -26,11 +26,7 @@ class OCRReader(BaseReader):
|
||||
self.ocr_endpoint = endpoint
|
||||
self.use_ocr = use_ocr
|
||||
|
||||
def load_data(
|
||||
self,
|
||||
file_path: Path,
|
||||
**kwargs,
|
||||
) -> List[Document]:
|
||||
def load_data(self, file_path: Path, **kwargs) -> List[Document]:
|
||||
"""Load data using OCR reader
|
||||
|
||||
Args:
|
||||
@@ -41,23 +37,24 @@ class OCRReader(BaseReader):
|
||||
Returns:
|
||||
List[Document]: list of documents extracted from the PDF file
|
||||
"""
|
||||
# create input params for the requests
|
||||
content = open(file_path, "rb")
|
||||
files = {"input": content}
|
||||
data = {"job_id": uuid4(), "table_only": not self.use_ocr}
|
||||
file_path = Path(file_path).resolve()
|
||||
|
||||
with file_path.open("rb") as content:
|
||||
files = {"input": content}
|
||||
data = {"job_id": uuid4(), "table_only": not self.use_ocr}
|
||||
|
||||
# call the API from FullOCR endpoint
|
||||
if "response_content" in kwargs:
|
||||
# overriding response content if specified
|
||||
ocr_results = kwargs["response_content"]
|
||||
else:
|
||||
# call original API
|
||||
resp = requests.post(url=self.ocr_endpoint, files=files, data=data)
|
||||
ocr_results = resp.json()["result"]
|
||||
|
||||
debug_path = kwargs.pop("debug_path", None)
|
||||
artifact_path = kwargs.pop("artifact_path", None)
|
||||
|
||||
# call the API from FullOCR endpoint
|
||||
if "response_content" in kwargs:
|
||||
# overriding response content if specified
|
||||
ocr_results = kwargs["response_content"]
|
||||
else:
|
||||
# call original API
|
||||
resp = requests.post(url=self.ocr_endpoint, files=files, data=data)
|
||||
ocr_results = resp.json()["result"]
|
||||
|
||||
# read PDF through normal reader (unstructured)
|
||||
pdf_page_items = read_pdf_unstructured(file_path)
|
||||
# merge PDF text output with OCR output
|
||||
@@ -77,6 +74,9 @@ class OCRReader(BaseReader):
|
||||
"type": "table",
|
||||
"page_label": page_id + 1,
|
||||
"source": file_path.name,
|
||||
"file_path": str(file_path),
|
||||
"file_name": file_path.name,
|
||||
"filename": str(file_path),
|
||||
},
|
||||
metadata_template="",
|
||||
metadata_seperator="",
|
||||
@@ -91,6 +91,9 @@ class OCRReader(BaseReader):
|
||||
metadata={
|
||||
"page_label": page_id + 1,
|
||||
"source": file_path.name,
|
||||
"file_path": str(file_path),
|
||||
"file_name": file_path.name,
|
||||
"filename": str(file_path),
|
||||
},
|
||||
)
|
||||
for page_id, non_table_text in texts
|
||||
|
@@ -74,9 +74,10 @@ class UnstructuredReader(BaseReader):
|
||||
""" Process elements """
|
||||
docs = []
|
||||
file_name = Path(file).name
|
||||
file_path = str(Path(file).resolve())
|
||||
if split_documents:
|
||||
for node in elements:
|
||||
metadata = {"file_name": file_name}
|
||||
metadata = {"file_name": file_name, "file_path": file_path}
|
||||
if hasattr(node, "metadata"):
|
||||
"""Load metadata fields"""
|
||||
for field, val in vars(node.metadata).items():
|
||||
@@ -99,7 +100,7 @@ class UnstructuredReader(BaseReader):
|
||||
|
||||
else:
|
||||
text_chunks = [" ".join(str(el).split()) for el in elements]
|
||||
metadata = {"file_name": file_name}
|
||||
metadata = {"file_name": file_name, "file_path": file_path}
|
||||
|
||||
if additional_metadata is not None:
|
||||
metadata.update(additional_metadata)
|
||||
|
Reference in New Issue
Block a user