Fix integrating indexing and retrieval pipelines to FileIndex (#155)
* Add docs for settings
* Add mdx_truly_sane_lists to doc requirements
This commit is contained in:
committed by
GitHub
parent
2b3571e892
commit
cb01d27d19
@@ -20,13 +20,12 @@ class DocxReader(BaseReader):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
try:
|
||||
import docx
|
||||
import docx # noqa
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"docx is not installed. "
|
||||
"Please install it using `pip install python-docx`"
|
||||
)
|
||||
self._module = docx
|
||||
|
||||
def load_data(
|
||||
self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
|
||||
@@ -39,9 +38,11 @@ class DocxReader(BaseReader):
|
||||
Returns:
|
||||
List[Document]: list of documents extracted from the DOCX file
|
||||
"""
|
||||
import docx
|
||||
|
||||
file_path = Path(file_path).resolve()
|
||||
|
||||
doc = self._module.Document(str(file_path))
|
||||
doc = docx.Document(str(file_path))
|
||||
all_text = "\n".join(
|
||||
[unicodedata.normalize("NFKC", p.text) for p in doc.paragraphs]
|
||||
)
|
||||
|
@@ -1,4 +1,3 @@
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
@@ -22,41 +21,37 @@ class HtmlReader(BaseReader):
|
||||
|
||||
def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):
|
||||
try:
|
||||
import html2text
|
||||
import html2text # noqa
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"html2text is not installed. "
|
||||
"Please install it using `pip install html2text`"
|
||||
)
|
||||
|
||||
self._module = html2text
|
||||
self._page_break_pattern: Optional[str] = page_break_pattern
|
||||
super().__init__()
|
||||
|
||||
def load_data(
|
||||
self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
|
||||
self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs
|
||||
) -> List[Document]:
|
||||
"""Load data using Html reader
|
||||
|
||||
Args:
|
||||
file_path (Path): Path to PDF file
|
||||
debug_path (Path): Path to store debug image output
|
||||
artifact_path (Path): Path to OCR endpoints artifacts directory
|
||||
file_path: path to HTML file
|
||||
extra_info: extra information passed to this reader during extracting data
|
||||
|
||||
Returns:
|
||||
List[Document]: list of documents extracted from the HTML file
|
||||
list[Document]: list of documents extracted from the HTML file
|
||||
"""
|
||||
import html2text
|
||||
|
||||
file_path = Path(file_path).resolve()
|
||||
|
||||
with file_path.open("r") as content:
|
||||
html_text = "".join(
|
||||
[
|
||||
unicodedata.normalize("NFKC", line[:-1])
|
||||
for line in content.readlines()
|
||||
]
|
||||
)
|
||||
with file_path.open("r") as f:
|
||||
html_text = "".join([line[:-1] for line in f.readlines()])
|
||||
|
||||
# read HTML
|
||||
all_text = self._module.html2text(html_text)
|
||||
all_text = html2text.html2text(html_text)
|
||||
pages = (
|
||||
all_text.split(self._page_break_pattern)
|
||||
if self._page_break_pattern
|
||||
|
Reference in New Issue
Block a user