refactor(knowledgebases): rework the Excel file parsing logic

zstar 2025-06-02 18:10:22 +08:00
parent dd2b661703
commit 45b7233432
2 changed files with 35 additions and 80 deletions


@@ -2,15 +2,13 @@ import os
 import tempfile
 import shutil
 import json
-import mysql.connector
 import time
 import traceback
 import re
 import requests
-from bs4 import BeautifulSoup
 from io import BytesIO
 from datetime import datetime
-from database import MINIO_CONFIG, DB_CONFIG, get_minio_client, get_es_client
+from database import MINIO_CONFIG, get_minio_client, get_es_client, get_db_connection
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
@@ -18,7 +16,7 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.read_api import read_local_office, read_local_images
 from utils import generate_uuid
 from .rag_tokenizer import RagTokenizer
+from .excel_parser import parse_excel

 tknzr = RagTokenizer()
@@ -54,17 +52,12 @@ def merge_chunks(sections, chunk_token_num=128, delimiter="\n。"):
     return chunks

-def _get_db_connection():
-    """Create a database connection"""
-    return mysql.connector.connect(**DB_CONFIG)

 def _update_document_progress(doc_id, progress=None, message=None, status=None, run=None, chunk_count=None, process_duration=None):
     """Update the document's progress and status in the database"""
     conn = None
     cursor = None
     try:
-        conn = _get_db_connection()
+        conn = get_db_connection()
         cursor = conn.cursor()
         updates = []
         params = []
@@ -109,7 +102,7 @@ def _update_kb_chunk_count(kb_id, count_delta):
     conn = None
     cursor = None
     try:
-        conn = _get_db_connection()
+        conn = get_db_connection()
         cursor = conn.cursor()
         current_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         kb_update = """
@@ -134,7 +127,7 @@ def _create_task_record(doc_id, chunk_ids_list):
     conn = None
     cursor = None
     try:
-        conn = _get_db_connection()
+        conn = get_db_connection()
         cursor = conn.cursor()
         task_id = generate_uuid()
         current_datetime = datetime.now()
@@ -184,58 +177,6 @@ def get_bbox_from_block(block):
     return [0, 0, 0, 0]

-def process_table_content(content_list):
-    """
-    Process table content, storing each row separately
-
-    Args:
-        content_list: the original content list
-
-    Returns:
-        the processed content list
-    """
-    new_content_list = []
-    for item in content_list:
-        if "table_body" in item and item["table_body"]:
-            # Parse the HTML table with BeautifulSoup
-            soup = BeautifulSoup(item["table_body"], "html.parser")
-            table = soup.find("table")
-            if table:
-                rows = table.find_all("tr")
-                # Get the header (the first row)
-                header_row = rows[0] if rows else None
-                # Process each row, starting from the second (skipping the header)
-                for i, row in enumerate(rows):
-                    # Create a new content item
-                    new_item = item.copy()
-                    # Create a table containing only the current row
-                    new_table = soup.new_tag("table")
-                    # If there is a header, add it
-                    if header_row and i > 0:
-                        new_table.append(header_row)
-                    # Append the current row
-                    new_table.append(row)
-                    # Build the new HTML structure
-                    new_html = f"<html><body>{str(new_table)}</body></html>"
-                    new_item["table_body"] = f"\n\n{new_html}\n\n"
-                    # Add to the new content list
-                    new_content_list.append(new_item)
-            else:
-                new_content_list.append(item)
-        else:
-            new_content_list.append(item)
-    return new_content_list

 def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
     """
     Core logic for performing document parsing
@@ -343,7 +284,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
             update_progress(0.3, "Analyzing PDF type")
             is_ocr = ds.classify() == SupportedPdfParseMethod.OCR
             mode_msg = "OCR mode" if is_ocr else "text mode"
-            update_progress(0.4, f"Processing PDF in {mode_msg}")
+            update_progress(0.4, f"Processing PDF in {mode_msg}; in progress, see the logs for details")
             infer_result = ds.apply(doc_analyze, ocr=is_ocr)
@@ -387,6 +328,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
             # Get the content list (JSON format)
             middle_content = pipe_result.get_middle_json()
             middle_json_content = json.loads(middle_content)

+        # Handle Excel files separately
         elif file_type.endswith("excel"):
             update_progress(0.3, "Using MinerU parser")
@@ -397,22 +339,11 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
                 f.write(file_content)
             print(f"[Parser-INFO] Temp file path: {temp_file_path}")

-            # Process with MinerU
-            ds = read_local_office(temp_file_path)[0]
-            infer_result = ds.apply(doc_analyze, ocr=True)
-            # Set up a temporary output directory
-            temp_image_dir = os.path.join(temp_dir, f"images_{doc_id}")
-            os.makedirs(temp_image_dir, exist_ok=True)
-            image_writer = FileBasedDataWriter(temp_image_dir)
-            update_progress(0.6, "Processing file results")
-            pipe_result = infer_result.pipe_txt_mode(image_writer)
-            update_progress(0.8, "Extracting content")
-            origin_content_list = pipe_result.get_content_list(os.path.basename(temp_image_dir))
-            # Process the content list
-            content_list = process_table_content(origin_content_list)
+            content_list = parse_excel(temp_file_path)

         elif file_type.endswith("visual"):
             update_progress(0.3, "Using MinerU parser")
@@ -430,7 +361,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
             update_progress(0.3, "Analyzing PDF type")
             is_ocr = ds.classify() == SupportedPdfParseMethod.OCR
             mode_msg = "OCR mode" if is_ocr else "text mode"
-            update_progress(0.4, f"Processing PDF in {mode_msg}")
+            update_progress(0.4, f"Processing PDF in {mode_msg}; in progress, see the logs for details")
             infer_result = ds.apply(doc_analyze, ocr=is_ocr)
@@ -690,7 +621,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
         conn = None
         cursor = None
         try:
-            conn = _get_db_connection()
+            conn = get_db_connection()
             cursor = conn.cursor()
             # Find the nearest image for each text chunk

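The four `conn = get_db_connection()` hunks above swap a module-local helper for a shared one in `database.py`. That file is not part of this diff; below is a minimal sketch of the shared helper, assuming it simply mirrors the removed `_get_db_connection` (all config values here are hypothetical placeholders):

# database.py (assumed shape; not shown in this diff)
import os

import mysql.connector

# Hypothetical placeholders; the real DB_CONFIG already lives in database.py
DB_CONFIG = {
    "host": os.getenv("DB_HOST", "localhost"),
    "port": int(os.getenv("DB_PORT", "3306")),
    "user": os.getenv("DB_USER", "root"),
    "password": os.getenv("DB_PASSWORD", ""),
    "database": os.getenv("DB_NAME", "rag"),
}

def get_db_connection():
    """Create a MySQL connection from the shared DB_CONFIG."""
    return mysql.connector.connect(**DB_CONFIG)

Centralizing the connection logic is what lets this commit drop the `mysql.connector` and `DB_CONFIG` imports from the parser module, along with the new `excel_parser` module diffed below.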

@@ -0,0 +1,24 @@
+import pandas as pd
+
+
+def parse_excel(file_path):
+    # Read the Excel file
+    df = pd.read_excel(file_path)
+    # Get the header row
+    headers = df.columns.tolist()
+    blocks = []
+    for _, row in df.iterrows():
+        # Build an HTML table from the header row plus the current data row
+        html_table = "<html><body><table><tr>{}</tr><tr>{}</tr></table></body></html>".format(
+            "".join(f"<td>{col}</td>" for col in headers),
+            "".join(f"<td>{row[col]}</td>" for col in headers),
+        )
+        block = {"type": "table", "img_path": "", "table_caption": [], "table_footnote": [], "table_body": f"{html_table}", "page_idx": 0}
+        blocks.append(block)
+    return blocks
+
+
+if __name__ == "__main__":
+    file_path = "test_excel.xls"
+    parse_excel_result = parse_excel(file_path)
+    print(parse_excel_result)
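
A quick usage sketch for the new `excel_parser` module (not part of the commit): it writes a hypothetical two-row spreadsheet via pandas and shows that `parse_excel` turns each data row into its own single-row HTML table block with the header repeated. Assumes `openpyxl` is available for the xlsx round trip; the file name is made up.

import pandas as pd

from excel_parser import parse_excel  # adjust the import to your package layout

# Hypothetical demo sheet: one header row plus two data rows
df = pd.DataFrame({"name": ["apple", "pear"], "price": [3, 5]})
df.to_excel("demo.xlsx", index=False)

blocks = parse_excel("demo.xlsx")
assert len(blocks) == 2  # one block per data row
print(blocks[0]["table_body"])
# -> <html><body><table><tr><td>name</td><td>price</td></tr><tr><td>apple</td><td>3</td></tr></table></body></html>

One caveat worth noting: `pd.read_excel` defaults to the first sheet (`sheet_name=0`), so multi-sheet workbooks are only partially parsed by this scheme.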