feat(document_parser): 支持解析数学公式类型的数据块

- 在文本和表格类型之外,增加了对数学公式(equation)类型数据块的处理
This commit is contained in:
zstar 2025-06-02 13:47:15 +08:00
parent 9e28a5372b
commit dd2b661703
1 changed file with 5 additions and 1 deletion

View File

@@ -524,13 +524,17 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
if chunk_idx == len(block_info_list): if chunk_idx == len(block_info_list):
print(f"[Parser-WARNING] block_info_list 的长度 ({len(block_info_list)}) 小于 content_list 的长度 ({len(content_list)})。后续块将使用默认 page_idx 和 bbox。") print(f"[Parser-WARNING] block_info_list 的长度 ({len(block_info_list)}) 小于 content_list 的长度 ({len(content_list)})。后续块将使用默认 page_idx 和 bbox。")
if chunk_data["type"] == "text" or chunk_data["type"] == "table": if chunk_data["type"] == "text" or chunk_data["type"] == "table" or chunk_data["type"] == "equation":
if chunk_data["type"] == "text": if chunk_data["type"] == "text":
content = chunk_data["text"] content = chunk_data["text"]
if not content or not content.strip(): if not content or not content.strip():
continue continue
# 过滤 markdown 特殊符号 # 过滤 markdown 特殊符号
content = re.sub(r"[!#\\$/]", "", content) content = re.sub(r"[!#\\$/]", "", content)
elif chunk_data["type"] == "equation":
content = chunk_data["text"]
if not content or not content.strip():
continue
elif chunk_data["type"] == "table": elif chunk_data["type"] == "table":
caption_list = chunk_data.get("table_caption", []) # 获取列表,默认为空列表 caption_list = chunk_data.get("table_caption", []) # 获取列表,默认为空列表
table_body = chunk_data.get("table_body", "") # 获取表格主体,默认为空字符串 table_body = chunk_data.get("table_body", "") # 获取表格主体,默认为空字符串