From dd2b661703bae85e2b9d0b049c7071ff6c6f013f Mon Sep 17 00:00:00 2001
From: zstar <65890619+zstar1003@users.noreply.github.com>
Date: Mon, 2 Jun 2025 13:47:15 +0800
Subject: [PATCH] =?UTF-8?q?feat(document=5Fparser):=20=E6=94=AF=E6=8C=81?=
 =?UTF-8?q?=E8=A7=A3=E6=9E=90=E6=95=B0=E5=AD=A6=E5=85=AC=E5=BC=8F=E7=B1=BB?=
 =?UTF-8?q?=E5=9E=8B=E7=9A=84=E6=95=B0=E6=8D=AE=E5=9D=97?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 在文本和表格类型之外，增加了对数学公式（equation）类型数据块的处理
---
 .../server/services/knowledgebases/document_parser.py       | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/management/server/services/knowledgebases/document_parser.py b/management/server/services/knowledgebases/document_parser.py
index 48e3ee8..92c4f18 100644
--- a/management/server/services/knowledgebases/document_parser.py
+++ b/management/server/services/knowledgebases/document_parser.py
@@ -524,13 +524,17 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
                 if chunk_idx == len(block_info_list):
                     print(f"[Parser-WARNING] block_info_list 的长度 ({len(block_info_list)}) 小于 content_list 的长度 ({len(content_list)})。后续块将使用默认 page_idx 和 bbox。")
 
-            if chunk_data["type"] == "text" or chunk_data["type"] == "table":
+            if chunk_data["type"] == "text" or chunk_data["type"] == "table" or chunk_data["type"] == "equation":
                 if chunk_data["type"] == "text":
                     content = chunk_data["text"]
                     if not content or not content.strip():
                         continue
                     # 过滤 markdown 特殊符号
                     content = re.sub(r"[!#\\$/]", "", content)
+                elif chunk_data["type"] == "equation":
+                    content = chunk_data["text"]
+                    if not content or not content.strip():
+                        continue
                 elif chunk_data["type"] == "table":
                     caption_list = chunk_data.get("table_caption", [])  # 获取列表，默认为空列表
                     table_body = chunk_data.get("table_body", "")  # 获取表格主体，默认为空字符串