From 2249ef3083223b0522b22cf4985a4187047284e8 Mon Sep 17 00:00:00 2001 From: zstar <65890619+zstar1003@users.noreply.github.com> Date: Tue, 10 Jun 2025 11:37:03 +0800 Subject: [PATCH] =?UTF-8?q?fix(document=5Fparser):=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E6=96=87=E6=A1=A3=E4=B8=AD=E5=AD=98=E5=9C=A8=E6=97=A0=E6=95=88?= =?UTF-8?q?=E7=9A=84Unicode=E5=AD=97=E7=AC=A6=E5=AF=BC=E8=87=B4=E8=A7=A3?= =?UTF-8?q?=E6=9E=90=E5=BC=82=E5=B8=B8=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 在将文档内容编码为 UTF-8 时,添加了错误处理参数 "replace" - 这样可以避免某些特殊字符导致的编码错误 - 修改了两处相关代码,确保内容正确上传到 MinIO --- management/server/services/knowledgebases/document_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/management/server/services/knowledgebases/document_parser.py b/management/server/services/knowledgebases/document_parser.py index 4951e0f..eb0dac5 100644 --- a/management/server/services/knowledgebases/document_parser.py +++ b/management/server/services/knowledgebases/document_parser.py @@ -550,8 +550,8 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info): minio_client.put_object( bucket_name=output_bucket, object_name=chunk_id, - data=BytesIO(content.encode("utf-8")), - length=len(content.encode("utf-8")), # 使用字节长度 + data=BytesIO(content.encode("utf-8", errors="replace")), + length=len(content.encode("utf-8", errors="replace")), ) # 准备ES文档