import html
import os
from typing import Any, Dict, List, Union

import pandas as pd

# File extensions this module knows how to read.
_EXCEL_EXTENSIONS = (".xlsx", ".xls")
_CSV_EXTENSION = ".csv"


def parse_excel_file(file_path: Union[str, "os.PathLike[str]"]) -> List[Dict[str, Any]]:
    """Parse a spreadsheet file into a flat list of table blocks.

    Excel workbooks (``.xlsx`` / ``.xls``) are read sheet by sheet; CSV files
    are read as a single sheet. Every data row becomes one block (see
    ``_process_dataframe`` for the block layout).

    Args:
        file_path: Path of the file to read. The format is chosen from the
            file extension, compared case-insensitively.

    Returns:
        The blocks of all sheets, in sheet order and then row order.

    Raises:
        ValueError: If the extension is not supported, or if reading or
            converting the file fails. In the latter case the underlying
            exception is attached as ``__cause__``.
    """
    file_ext = os.path.splitext(file_path)[1].lower()

    # Reject unknown formats before touching the file, so the error is not
    # re-wrapped as a parse failure.
    if file_ext not in _EXCEL_EXTENSIONS and file_ext != _CSV_EXTENSION:
        raise ValueError(f"Unsupported file format: {file_ext}")

    blocks: List[Dict[str, Any]] = []
    try:
        if file_ext == _CSV_EXTENSION:
            # A CSV file holds exactly one table.
            blocks.extend(_process_dataframe(pd.read_csv(file_path), "CSV"))
        else:
            # sheet_name=None makes pandas return every sheet as a
            # {sheet name: DataFrame} mapping.
            all_sheets = pd.read_excel(file_path, sheet_name=None)
            for sheet_name, df in all_sheets.items():
                blocks.extend(_process_dataframe(df, sheet_name))
    except Exception as e:
        # Normalise every reader/conversion failure to ValueError while
        # keeping the original traceback reachable through the cause chain.
        raise ValueError(f"Failed to parse file {file_path}: {e}") from e

    return blocks


def _cell_text(value: Any) -> str:
    """Return the HTML-escaped text of one cell; missing values become ''."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return html.escape(str(value))


def _process_dataframe(df: pd.DataFrame, sheet_name: Any) -> List[Dict[str, Any]]:
    """Turn one DataFrame into a list of per-row table blocks.

    Missing cells are forward-filled from the row above first. Each row then
    yields one block whose ``table_body`` is a two-row HTML table: the column
    headers, followed by that row's values.

    Args:
        df: The sheet contents.
        sheet_name: Name of the originating sheet. Accepted for interface
            compatibility; it is not used in the generated blocks.

    Returns:
        One dict per row with the keys ``type``, ``img_path``,
        ``table_caption``, ``table_footnote``, ``table_body`` and
        ``page_idx``.
    """
    df = df.ffill()

    # The header row is identical for every block, so build it once.
    header_cells = "".join(
        f"<th>{html.escape(str(col))}</th>" for col in df.columns
    )

    blocks: List[Dict[str, Any]] = []
    # itertuples (positional, unnamed) keeps each column's own dtype and is
    # unaffected by duplicate column names, unlike iterrows + row[col].
    for values in df.itertuples(index=False, name=None):
        # Cells are wrapped in <th>/<td> so adjacent values stay
        # distinguishable, and escaped so cell text cannot break the markup.
        data_cells = "".join(f"<td>{_cell_text(v)}</td>" for v in values)
        html_table = (
            f"<table><tr>{header_cells}</tr><tr>{data_cells}</tr></table>\n"
        )
        blocks.append(
            {
                "type": "table",
                "img_path": "",
                "table_caption": "",
                "table_footnote": [],
                "table_body": html_table,
                "page_idx": 0,
            }
        )
    return blocks


if __name__ == "__main__":
    # Manual smoke test against sample files in the working directory.
    excel_path = "test.xlsx"
    csv_path = "test.csv"

    try:
        # Excel parsing
        excel_blocks = parse_excel_file(excel_path)
        print(f"Excel解析结果(共{len(excel_blocks)}条):")
        print(excel_blocks[:1])  # show the first block as a sample

        # CSV parsing
        csv_blocks = parse_excel_file(csv_path)
        print(f"\nCSV解析结果(共{len(csv_blocks)}条):")
        print(csv_blocks[:1])
    except Exception as e:
        print(f"错误: {str(e)}")