70 lines
2.1 KiB
Python
70 lines
2.1 KiB
Python
import html
import os

import pandas as pd
|
||
|
||
|
||
def parse_excel_file(file_path):
    """Parse a spreadsheet file into a list of uniform table blocks.

    Supports Excel workbooks (``.xlsx``/``.xls``; every sheet is read) and
    CSV files. The reader is selected from the file extension, compared
    case-insensitively.

    Args:
        file_path: Path of the file to parse.

    Returns:
        A list containing the blocks produced by ``_process_dataframe`` for
        each Excel sheet, or for the single CSV table.

    Raises:
        ValueError: If the extension is not supported, or if reading or
            processing the file fails for any reason. The underlying
            exception is attached as ``__cause__``.
    """
    file_ext = os.path.splitext(file_path)[1].lower()
    blocks = []

    try:
        if file_ext in (".xlsx", ".xls"):
            # sheet_name=None makes pandas return {sheet name: DataFrame}
            # for every sheet in the workbook.
            all_sheets = pd.read_excel(file_path, sheet_name=None)
            for sheet_name, df in all_sheets.items():
                blocks.extend(_process_dataframe(df, sheet_name))
        elif file_ext == ".csv":
            # A CSV file holds a single table; it is labelled "CSV".
            df = pd.read_csv(file_path)
            blocks.extend(_process_dataframe(df, "CSV"))
        else:
            raise ValueError(f"Unsupported file format: {file_ext}")
    except Exception as e:
        # Every failure is reported to callers as ValueError; chain the
        # original exception explicitly so the root cause stays available.
        raise ValueError(f"Failed to parse file {file_path}: {e}") from e

    return blocks
|
||
|
||
|
||
def _process_dataframe(df, sheet_name):
|
||
"""处理单个DataFrame,生成统一格式的数据块"""
|
||
df = df.ffill()
|
||
headers = df.columns.tolist()
|
||
blocks = []
|
||
|
||
for _, row in df.iterrows():
|
||
html_table = "<html><body><table><tr>{}</tr><tr>{}</tr></table></body></html>".format("".join(f"<td>{col}</td>" for col in headers), "".join(f"<td>{row[col]}</td>" for col in headers))
|
||
|
||
block = {"type": "table", "img_path": "", "table_caption": "", "table_footnote": [], "table_body": html_table, "page_idx": 0}
|
||
blocks.append(block)
|
||
|
||
return blocks
|
||
|
||
|
||
if __name__ == "__main__":
    # Manual smoke test: parse one sample Excel file and one sample CSV file
    # and show how many blocks each produced plus the first block.
    excel_path, csv_path = "test.xlsx", "test.csv"

    try:
        # Excel sample
        excel_blocks = parse_excel_file(excel_path)
        excel_count = len(excel_blocks)
        print(f"Excel解析结果(共{excel_count}条):")
        print(excel_blocks[:1])  # first block only, as an example

        # CSV sample (only reached when the Excel sample parsed cleanly)
        csv_blocks = parse_excel_file(csv_path)
        csv_count = len(csv_blocks)
        print(f"\nCSV解析结果(共{csv_count}条):")
        print(csv_blocks[:1])
    except Exception as exc:
        # Report any failure instead of letting the traceback escape.
        print(f"错误: {exc}")
|