Merge pull request #157 from xinsenyan/main

refactor: 更新解析excel的函数，适配更复杂的表格结构
2025-06-09 19:07:56 +08:00 · 2025-06-09 19:07:56 +08:00 · 0640f973aa
parent cd2f479bb4 d5fbd8d620
commit 0640f973aa
1 changed files with 10 additions and 9 deletions
--- a/management/server/services/knowledgebases/excel_parser.py
+++ b/management/server/services/knowledgebases/excel_parser.py
@@ -2,18 +2,19 @@ import pandas as pd
 def parse_excel(file_path):
-    # 读取Excel文件
+    # 读取所有工作表
-    df = pd.read_excel(file_path)
+    all_sheets = pd.read_excel(file_path, sheet_name=None)  # 读取所有sheet
-    # 获取表头
+
    headers = df.columns.tolist()
    blocks = []
-    for _, row in df.iterrows():
+    for sheet_name, df in all_sheets.items():
-        # 构建HTML表格
+        df = df.fillna(method="ffill")  # 填充合并的单元格
-        html_table = "<html><body><table><tr>{}</tr><tr>{}</tr></table></body></html>".format("".join(f"<td>{col}</td>" for col in headers), "".join(f"<td>{row[col]}</td>" for col in headers))
+        headers = df.columns.tolist()
        block = {"type": "table", "img_path": "", "table_caption": [], "table_footnote": [], "table_body": f"{html_table}", "page_idx": 0}
-        blocks.append(block)
+        for _, row in df.iterrows():
            html_table = "<html><body><table><tr>{}</tr><tr>{}</tr></table></body></html>".format("".join(f"<td>{col}</td>" for col in headers), "".join(f"<td>{row[col]}</td>" for col in headers))
            block = {"type": "table", "img_path": "", "table_caption": [f"Sheet: {sheet_name}"], "table_footnote": [], "table_body": f"{html_table}", "page_idx": 0}
            blocks.append(block)
    return blocks