2025-06-12 22:50:19 +08:00
|
|
|
|
import os
|
|
|
|
|
|
2025-06-02 18:10:22 +08:00
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
2025-06-12 22:50:19 +08:00
|
|
|
|
def parse_excel_file(file_path):
    """Parse a spreadsheet-like file into a list of table blocks.

    Despite its name, this function handles both Excel workbooks
    (``.xlsx`` / ``.xls``) and CSV files; the reader is chosen from the
    file extension (compared case-insensitively).

    Args:
        file_path: Path to the file to parse.

    Returns:
        A list of block dicts as produced by ``_process_dataframe``.
        For Excel workbooks, the blocks of every sheet are concatenated
        in the order the sheets are returned by ``pd.read_excel``.

    Raises:
        ValueError: If the extension is not supported or if reading /
            converting the file fails for any reason. The original
            exception is attached as ``__cause__``.
    """
    blocks = []

    # Pick the reader based on the file extension.
    file_ext = os.path.splitext(file_path)[1].lower()

    try:
        if file_ext in (".xlsx", ".xls"):
            # sheet_name=None loads every sheet as {sheet name: DataFrame}.
            all_sheets = pd.read_excel(file_path, sheet_name=None)
            for sheet_name, df in all_sheets.items():
                blocks.extend(_process_dataframe(df, sheet_name))
        elif file_ext == ".csv":
            # A CSV holds a single table; "CSV" stands in for the sheet name.
            df = pd.read_csv(file_path)
            blocks.extend(_process_dataframe(df, "CSV"))
        else:
            raise ValueError(f"Unsupported file format: {file_ext}")
    except Exception as e:
        # Normalize every failure to ValueError for callers, but chain the
        # original exception so its type and traceback are not lost.
        raise ValueError(f"Failed to parse file {file_path}: {e}") from e

    return blocks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _process_dataframe(df, sheet_name):
    """Turn each row of a DataFrame into a standalone HTML table block.

    Missing values are forward-filled down each column first. Every data
    row then becomes one block whose ``table_body`` is a two-row HTML
    table: the column headers followed by that row's values.

    Args:
        df: The DataFrame to convert.
        sheet_name: Name of the originating sheet. Accepted for interface
            compatibility; it is not currently written into the blocks.

    Returns:
        A list with one dict per data row, each with the keys ``type``,
        ``img_path``, ``table_caption``, ``table_footnote``,
        ``table_body`` and ``page_idx``. Empty if ``df`` has no rows.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def _cell(value):
        # Escape &, < and > so cell text cannot break the table markup.
        return f"<td>{escape(str(value), quote=False)}</td>"

    df = df.ffill()

    # The header row is identical for every block, so build it once.
    header_cells = "".join(_cell(col) for col in df.columns.tolist())

    blocks = []
    # itertuples keeps each column's own dtype (iterrows would upcast a
    # mixed int/float row to float and render 1 as "1.0") and is positional,
    # so duplicate column names are handled correctly.
    for values in df.itertuples(index=False, name=None):
        value_cells = "".join(_cell(value) for value in values)
        html_table = (
            "<html><body><table>"
            f"<tr>{header_cells}</tr><tr>{value_cells}</tr>"
            "</table></body></html>"
        )
        blocks.append(
            {
                "type": "table",
                "img_path": "",
                "table_caption": "",
                "table_footnote": [],
                "table_body": html_table,
                "page_idx": 0,
            }
        )

    return blocks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: parse one sample workbook and one sample CSV that
    # are expected to sit in the current working directory.
    sample_xlsx = "test.xlsx"
    sample_csv = "test.csv"

    try:
        # Excel parsing.
        xlsx_result = parse_excel_file(sample_xlsx)
        print(f"Excel解析结果(共{len(xlsx_result)}条):")
        print(xlsx_result[:1])  # show only the first block as a sample

        # CSV parsing.
        csv_result = parse_excel_file(sample_csv)
        print(f"\nCSV解析结果(共{len(csv_result)}条):")
        print(csv_result[:1])
    except Exception as e:
        # Report any failure instead of letting the demo crash.
        print(f"错误: {str(e)}")
|