refactor(knowledgebases): rework the Excel file parsing logic

zstar 2025-06-02 18:10:22 +08:00
parent dd2b661703
commit 45b7233432
2 changed files with 35 additions and 80 deletions


@@ -2,15 +2,13 @@ import os
 import tempfile
 import shutil
 import json
-import mysql.connector
 import time
 import traceback
 import re
 import requests
-from bs4 import BeautifulSoup
 from io import BytesIO
 from datetime import datetime
-from database import MINIO_CONFIG, DB_CONFIG, get_minio_client, get_es_client
+from database import MINIO_CONFIG, get_minio_client, get_es_client, get_db_connection
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
@@ -18,7 +16,7 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.read_api import read_local_office, read_local_images
 from utils import generate_uuid
 from .rag_tokenizer import RagTokenizer
+from .excel_parser import parse_excel

 tknzr = RagTokenizer()
@@ -54,17 +52,12 @@ def merge_chunks(sections, chunk_token_num=128, delimiter="\n。"):
     return chunks

-def _get_db_connection():
-    """Create a database connection"""
-    return mysql.connector.connect(**DB_CONFIG)

 def _update_document_progress(doc_id, progress=None, message=None, status=None, run=None, chunk_count=None, process_duration=None):
     """Update the document's progress and status in the database"""
     conn = None
     cursor = None
     try:
-        conn = _get_db_connection()
+        conn = get_db_connection()
         cursor = conn.cursor()
         updates = []
         params = []
@@ -109,7 +102,7 @@ def _update_kb_chunk_count(kb_id, count_delta):
     conn = None
     cursor = None
     try:
-        conn = _get_db_connection()
+        conn = get_db_connection()
         cursor = conn.cursor()
         current_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         kb_update = """
@@ -134,7 +127,7 @@ def _create_task_record(doc_id, chunk_ids_list):
     conn = None
     cursor = None
     try:
-        conn = _get_db_connection()
+        conn = get_db_connection()
         cursor = conn.cursor()
         task_id = generate_uuid()
         current_datetime = datetime.now()
@@ -184,58 +177,6 @@ def get_bbox_from_block(block):
     return [0, 0, 0, 0]

-def process_table_content(content_list):
-    """
-    Process table content, storing each row separately
-
-    Args:
-        content_list: the original content list
-
-    Returns:
-        the processed content list
-    """
-    new_content_list = []
-    for item in content_list:
-        if "table_body" in item and item["table_body"]:
-            # Parse the HTML table with BeautifulSoup
-            soup = BeautifulSoup(item["table_body"], "html.parser")
-            table = soup.find("table")
-            if table:
-                rows = table.find_all("tr")
-                # Get the header (the first row)
-                header_row = rows[0] if rows else None
-                # Process each row, starting from the second (skipping the header)
-                for i, row in enumerate(rows):
-                    # Create a new content item
-                    new_item = item.copy()
-                    # Create a table containing only the current row
-                    new_table = soup.new_tag("table")
-                    # If there is a header, add it
-                    if header_row and i > 0:
-                        new_table.append(header_row)
-                    # Append the current row
-                    new_table.append(row)
-                    # Build the new HTML structure
-                    new_html = f"<html><body>{str(new_table)}</body></html>"
-                    new_item["table_body"] = f"\n\n{new_html}\n\n"
-                    # Add to the new content list
-                    new_content_list.append(new_item)
-            else:
-                new_content_list.append(item)
-        else:
-            new_content_list.append(item)
-    return new_content_list

 def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
     """
     Core logic for performing document parsing
@@ -343,7 +284,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
             update_progress(0.3, "Analyzing PDF type")
             is_ocr = ds.classify() == SupportedPdfParseMethod.OCR
             mode_msg = "OCR mode" if is_ocr else "text mode"
-            update_progress(0.4, f"Processing PDF in {mode_msg}")
+            update_progress(0.4, f"Processing PDF in {mode_msg}; in progress, see the logs for details")
             infer_result = ds.apply(doc_analyze, ocr=is_ocr)
@@ -387,6 +328,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
             # Get the content list (JSON format)
             middle_content = pipe_result.get_middle_json()
             middle_json_content = json.loads(middle_content)

+        # Handle Excel files separately
         elif file_type.endswith("excel"):
             update_progress(0.3, "Using MinerU parser")
@@ -397,22 +339,11 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
                 f.write(file_content)
             print(f"[Parser-INFO] Temp file path: {temp_file_path}")

-            # Process with MinerU
-            ds = read_local_office(temp_file_path)[0]
-            infer_result = ds.apply(doc_analyze, ocr=True)
-            # Set up a temporary output directory
-            temp_image_dir = os.path.join(temp_dir, f"images_{doc_id}")
-            os.makedirs(temp_image_dir, exist_ok=True)
-            image_writer = FileBasedDataWriter(temp_image_dir)
-            update_progress(0.6, "Processing file results")
-            pipe_result = infer_result.pipe_txt_mode(image_writer)
-            update_progress(0.8, "Extracting content")
-            origin_content_list = pipe_result.get_content_list(os.path.basename(temp_image_dir))
-            # Process the content list
-            content_list = process_table_content(origin_content_list)
+            content_list = parse_excel(temp_file_path)

         elif file_type.endswith("visual"):
             update_progress(0.3, "Using MinerU parser")
@@ -430,7 +361,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
             update_progress(0.3, "Analyzing PDF type")
             is_ocr = ds.classify() == SupportedPdfParseMethod.OCR
             mode_msg = "OCR mode" if is_ocr else "text mode"
-            update_progress(0.4, f"Processing PDF in {mode_msg}")
+            update_progress(0.4, f"Processing PDF in {mode_msg}; in progress, see the logs for details")
             infer_result = ds.apply(doc_analyze, ocr=is_ocr)
@@ -690,7 +621,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
         conn = None
         cursor = None
         try:
-            conn = _get_db_connection()
+            conn = get_db_connection()
             cursor = conn.cursor()
             # Find the nearest image for each text chunk

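The four `conn = get_db_connection()` hunks above swap a module-local helper for a shared one in `database.py`. That file is not part of this diff; below is a minimal sketch of the shared helper, assuming it simply mirrors the removed `_get_db_connection` (all config values here are hypothetical placeholders):

# database.py (assumed shape; not shown in this diff)
import os

import mysql.connector

# Hypothetical placeholders; the real DB_CONFIG already lives in database.py
DB_CONFIG = {
    "host": os.getenv("DB_HOST", "localhost"),
    "port": int(os.getenv("DB_PORT", "3306")),
    "user": os.getenv("DB_USER", "root"),
    "password": os.getenv("DB_PASSWORD", ""),
    "database": os.getenv("DB_NAME", "rag"),
}

def get_db_connection():
    """Create a MySQL connection from the shared DB_CONFIG."""
    return mysql.connector.connect(**DB_CONFIG)

Centralizing the connection logic is what lets this commit drop the `mysql.connector` and `DB_CONFIG` imports from the parser module, along with the new `excel_parser` module diffed below.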

@@ -0,0 +1,24 @@
+import pandas as pd
+
+
+def parse_excel(file_path):
+    # Read the Excel file
+    df = pd.read_excel(file_path)
+    # Get the header row
+    headers = df.columns.tolist()
+    blocks = []
+    for _, row in df.iterrows():
+        # Build an HTML table from the header row plus the current data row
+        html_table = "<html><body><table><tr>{}</tr><tr>{}</tr></table></body></html>".format(
+            "".join(f"<td>{col}</td>" for col in headers),
+            "".join(f"<td>{row[col]}</td>" for col in headers),
+        )
+        block = {"type": "table", "img_path": "", "table_caption": [], "table_footnote": [], "table_body": f"{html_table}", "page_idx": 0}
+        blocks.append(block)
+    return blocks
+
+
+if __name__ == "__main__":
+    file_path = "test_excel.xls"
+    parse_excel_result = parse_excel(file_path)
+    print(parse_excel_result)
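
A quick usage sketch for the new `excel_parser` module (not part of the commit): it writes a hypothetical two-row spreadsheet via pandas and shows that `parse_excel` turns each data row into its own single-row HTML table block with the header repeated. Assumes `openpyxl` is available for the xlsx round trip; the file name is made up.

import pandas as pd

from excel_parser import parse_excel  # adjust the import to your package layout

# Hypothetical demo sheet: one header row plus two data rows
df = pd.DataFrame({"name": ["apple", "pear"], "price": [3, 5]})
df.to_excel("demo.xlsx", index=False)

blocks = parse_excel("demo.xlsx")
assert len(blocks) == 2  # one block per data row
print(blocks[0]["table_body"])
# -> <html><body><table><tr><td>name</td><td>price</td></tr><tr><td>apple</td><td>3</td></tr></table></body></html>

One caveat worth noting: `pd.read_excel` defaults to the first sheet (`sheet_name=0`), so multi-sheet workbooks are only partially parsed by this scheme.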