Migrate the MVP into kotaemon (#108)

- Migrate the MVP into kotaemon.
- Preliminarily include the pipeline within the chatbot interface.
- Organize MVP as an application.

Todo:

- Add an info panel to view the planning of agents -> Fix streaming agents' output.

Resolve: #60
Resolve: #61
Resolve: #62
This commit is contained in:
Duc Nguyen (john)
2024-01-10 15:28:09 +07:00
committed by GitHub
parent 230328c62f
commit 5a9d6f75be
31 changed files with 273 additions and 92 deletions

View File

@@ -26,11 +26,7 @@ class OCRReader(BaseReader):
self.ocr_endpoint = endpoint
self.use_ocr = use_ocr
def load_data(
self,
file_path: Path,
**kwargs,
) -> List[Document]:
def load_data(self, file_path: Path, **kwargs) -> List[Document]:
"""Load data using OCR reader
Args:
@@ -41,23 +37,24 @@ class OCRReader(BaseReader):
Returns:
List[Document]: list of documents extracted from the PDF file
"""
# create input params for the requests
content = open(file_path, "rb")
files = {"input": content}
data = {"job_id": uuid4(), "table_only": not self.use_ocr}
file_path = Path(file_path).resolve()
with file_path.open("rb") as content:
files = {"input": content}
data = {"job_id": uuid4(), "table_only": not self.use_ocr}
# call the API from FullOCR endpoint
if "response_content" in kwargs:
# overriding response content if specified
ocr_results = kwargs["response_content"]
else:
# call original API
resp = requests.post(url=self.ocr_endpoint, files=files, data=data)
ocr_results = resp.json()["result"]
debug_path = kwargs.pop("debug_path", None)
artifact_path = kwargs.pop("artifact_path", None)
# call the API from FullOCR endpoint
if "response_content" in kwargs:
# overriding response content if specified
ocr_results = kwargs["response_content"]
else:
# call original API
resp = requests.post(url=self.ocr_endpoint, files=files, data=data)
ocr_results = resp.json()["result"]
# read PDF through normal reader (unstructured)
pdf_page_items = read_pdf_unstructured(file_path)
# merge PDF text output with OCR output
@@ -77,6 +74,9 @@ class OCRReader(BaseReader):
"type": "table",
"page_label": page_id + 1,
"source": file_path.name,
"file_path": str(file_path),
"file_name": file_path.name,
"filename": str(file_path),
},
metadata_template="",
metadata_seperator="",
@@ -91,6 +91,9 @@ class OCRReader(BaseReader):
metadata={
"page_label": page_id + 1,
"source": file_path.name,
"file_path": str(file_path),
"file_name": file_path.name,
"filename": str(file_path),
},
)
for page_id, non_table_text in texts

View File

@@ -74,9 +74,10 @@ class UnstructuredReader(BaseReader):
""" Process elements """
docs = []
file_name = Path(file).name
file_path = str(Path(file).resolve())
if split_documents:
for node in elements:
metadata = {"file_name": file_name}
metadata = {"file_name": file_name, "file_path": file_path}
if hasattr(node, "metadata"):
"""Load metadata fields"""
for field, val in vars(node.metadata).items():
@@ -99,7 +100,7 @@ class UnstructuredReader(BaseReader):
else:
text_chunks = [" ".join(str(el).split()) for el in elements]
metadata = {"file_name": file_name}
metadata = {"file_name": file_name, "file_path": file_path}
if additional_metadata is not None:
metadata.update(additional_metadata)