From dd2b661703bae85e2b9d0b049c7071ff6c6f013f Mon Sep 17 00:00:00 2001 From: zstar <65890619+zstar1003@users.noreply.github.com> Date: Mon, 2 Jun 2025 13:47:15 +0800 Subject: [PATCH] =?UTF-8?q?feat(document=5Fparser):=20=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E8=A7=A3=E6=9E=90=E6=95=B0=E5=AD=A6=E5=85=AC=E5=BC=8F=E7=B1=BB?= =?UTF-8?q?=E5=9E=8B=E7=9A=84=E6=95=B0=E6=8D=AE=E5=9D=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在文本和表格类型之外,增加了对数学公式(equation)类型数据块的处理 --- .../server/services/knowledgebases/document_parser.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/management/server/services/knowledgebases/document_parser.py b/management/server/services/knowledgebases/document_parser.py index 48e3ee8..92c4f18 100644 --- a/management/server/services/knowledgebases/document_parser.py +++ b/management/server/services/knowledgebases/document_parser.py @@ -524,13 +524,17 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info): if chunk_idx == len(block_info_list): print(f"[Parser-WARNING] block_info_list 的长度 ({len(block_info_list)}) 小于 content_list 的长度 ({len(content_list)})。后续块将使用默认 page_idx 和 bbox。") - if chunk_data["type"] == "text" or chunk_data["type"] == "table": + if chunk_data["type"] == "text" or chunk_data["type"] == "table" or chunk_data["type"] == "equation": if chunk_data["type"] == "text": content = chunk_data["text"] if not content or not content.strip(): continue # 过滤 markdown 特殊符号 content = re.sub(r"[!#\\$/]", "", content) + elif chunk_data["type"] == "equation": + content = chunk_data["text"] + if not content or not content.strip(): + continue elif chunk_data["type"] == "table": caption_list = chunk_data.get("table_caption", []) # 获取列表,默认为空列表 table_body = chunk_data.get("table_body", "") # 获取表格主体,默认为空字符串