RAGflow/management/server/services/knowledgebases/excel_parser.py

70 lines
2.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import html
import os

import pandas as pd
def parse_excel_file(file_path):
    """Parse a spreadsheet file into a flat list of table blocks.

    Supports Excel workbooks (``.xlsx`` / ``.xls``, every sheet is read) and
    CSV files. The reader is chosen from the file extension, compared
    case-insensitively.

    Args:
        file_path: Path of the file to parse.

    Returns:
        The blocks produced by ``_process_dataframe`` for each sheet,
        concatenated in the order pandas returns the sheets. A CSV file is
        processed as a single sheet named ``"CSV"``.

    Raises:
        ValueError: If the extension is not supported, or if reading or
            processing the file fails. The underlying exception is chained
            as ``__cause__`` so the original traceback is preserved.
    """
    blocks = []
    # Pick the reader from the (lower-cased) file extension.
    file_ext = os.path.splitext(file_path)[1].lower()
    try:
        if file_ext in (".xlsx", ".xls"):
            # sheet_name=None makes pandas return a {sheet name: DataFrame}
            # mapping covering every sheet in the workbook.
            all_sheets = pd.read_excel(file_path, sheet_name=None)
            for sheet_name, df in all_sheets.items():
                blocks.extend(_process_dataframe(df, sheet_name))
        elif file_ext == ".csv":
            # A CSV file holds a single table; give it a fixed sheet name.
            df = pd.read_csv(file_path)
            blocks.extend(_process_dataframe(df, "CSV"))
        else:
            raise ValueError(f"Unsupported file format: {file_ext}")
    except Exception as e:
        # Normalize every failure to ValueError for callers, but keep the
        # original exception attached as the explicit cause.
        raise ValueError(f"Failed to parse file {file_path}: {str(e)}") from e
    return blocks
def _process_dataframe(df, sheet_name):
"""处理单个DataFrame生成统一格式的数据块"""
df = df.ffill()
headers = df.columns.tolist()
blocks = []
for _, row in df.iterrows():
html_table = "<html><body><table><tr>{}</tr><tr>{}</tr></table></body></html>".format("".join(f"<td>{col}</td>" for col in headers), "".join(f"<td>{row[col]}</td>" for col in headers))
block = {"type": "table", "img_path": "", "table_caption": "", "table_footnote": [], "table_body": html_table, "page_idx": 0}
blocks.append(block)
return blocks
if __name__ == "__main__":
    # Manual smoke test: parse one sample Excel file and one sample CSV file
    # from the working directory, printing the block count and the first
    # block of each. Any failure is reported instead of propagating.
    samples = (
        ("Excel解析结果", "test.xlsx"),
        ("\nCSV解析结果", "test.csv"),
    )
    try:
        for heading, sample_path in samples:
            parsed = parse_excel_file(sample_path)
            print(f"{heading}{len(parsed)}条):")
            print(parsed[:1])  # show only the first block as an example
    except Exception as exc:
        print(f"错误: {exc}")