"""Pandas Excel reader. Pandas parser for .xlsx files. """ from pathlib import Path from typing import Any, List, Optional, Union from llama_index.readers.base import BaseReader from kotaemon.base import Document class PandasExcelReader(BaseReader): r"""Pandas-based CSV parser. Parses CSVs using the separator detection from Pandas `read_csv` function. If special parameters are required, use the `pandas_config` dict. Args: pandas_config (dict): Options for the `pandas.read_excel` function call. Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html for more information. Set to empty dict by default, this means defaults will be used. """ def __init__( self, *args: Any, pandas_config: Optional[dict] = None, row_joiner: str = "\n", col_joiner: str = " ", **kwargs: Any, ) -> None: """Init params.""" super().__init__(*args, **kwargs) self._pandas_config = pandas_config or {} self._row_joiner = row_joiner if row_joiner else "\n" self._col_joiner = col_joiner if col_joiner else " " def load_data( self, file: Path, include_sheetname: bool = False, sheet_name: Optional[Union[str, int, list]] = None, **kwargs, ) -> List[Document]: """Parse file and extract values from a specific column. Args: file (Path): The path to the Excel file to read. include_sheetname (bool): Whether to include the sheet name in the output. sheet_name (Union[str, int, None]): The specific sheet to read from, default is None which reads all sheets. Returns: List[Document]: A list of`Document objects containing the values from the specified column in the Excel file. """ import itertools try: import pandas as pd except ImportError: raise ImportError( "install pandas using `pip3 install pandas` to use this loader" ) if sheet_name is not None: sheet_name = ( [sheet_name] if not isinstance(sheet_name, list) else sheet_name ) dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config) sheet_names = dfs.keys() df_sheets = [] for key in sheet_names: sheet = [] if include_sheetname: sheet.append([key]) sheet.extend(dfs[key].values.astype(str).tolist()) df_sheets.append(sheet) text_list = list( itertools.chain.from_iterable(df_sheets) ) # flatten list of lists output = [ Document( text=self._row_joiner.join( self._col_joiner.join(sublist) for sublist in text_list ), metadata={"source": file.stem}, ) ] return output