From 0b1126b1c88fa741809661cb738e50ebf1ce24bb Mon Sep 17 00:00:00 2001 From: zstar <65890619+zstar1003@users.noreply.github.com> Date: Thu, 12 Jun 2025 22:50:19 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=A2=9E=E5=8A=A0=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E5=AF=B9csv=E6=A0=BC=E5=BC=8F=E6=96=87=E4=BB=B6=E7=9A=84?= =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E5=92=8C=E8=A7=A3=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- management/server/routes/files/routes.py | 8 +-- .../server/services/files/document_service.py | 52 +++++++------- .../server/services/files/file_service.py | 20 ++---- management/server/services/files/service.py | 13 ++-- management/server/services/files/utils.py | 9 ++- .../knowledgebases/document_parser.py | 4 +- .../services/knowledgebases/excel_parser.py | 71 +++++++++++++++---- management/server/utils.py | 24 +++---- 8 files changed, 122 insertions(+), 79 deletions(-) diff --git a/management/server/routes/files/routes.py b/management/server/routes/files/routes.py index 8b63d99..1eff273 100644 --- a/management/server/routes/files/routes.py +++ b/management/server/routes/files/routes.py @@ -1,11 +1,11 @@ -from flask import jsonify, request, send_file, current_app from io import BytesIO -from .. import files_bp - -from services.files.service import get_files_list, get_file_info, download_file_from_minio, delete_file, batch_delete_files, handle_chunk_upload, merge_chunks, upload_files_to_server +from flask import current_app, jsonify, request, send_file +from services.files.service import batch_delete_files, delete_file, download_file_from_minio, get_file_info, get_files_list, handle_chunk_upload, merge_chunks, upload_files_to_server from services.files.utils import FileType +from .. import files_bp + UPLOAD_FOLDER = "/data/uploads" ALLOWED_EXTENSIONS = {"txt", "pdf", "png", "jpg", "jpeg", "gif", "doc", "docx", "xls", "xlsx"} diff --git a/management/server/services/files/document_service.py b/management/server/services/files/document_service.py index 721f018..bd405d9 100644 --- a/management/server/services/files/document_service.py +++ b/management/server/services/files/document_service.py @@ -1,16 +1,18 @@ -from peewee import * +from peewee import * # noqa: F403 + from .base_service import BaseService from .models import Document -from .utils import get_uuid, StatusEnum +from .utils import StatusEnum, get_uuid + class DocumentService(BaseService): model = Document - + @classmethod def create_document(cls, kb_id: str, name: str, location: str, size: int, file_type: str, created_by: str = None, parser_id: str = None, parser_config: dict = None) -> Document: """ 创建文档记录 - + Args: kb_id: 知识库ID name: 文件名 @@ -20,34 +22,34 @@ class DocumentService(BaseService): created_by: 创建者ID parser_id: 解析器ID parser_config: 解析器配置 - + Returns: Document: 创建的文档对象 """ doc_id = get_uuid() - + # 构建基本文档数据 doc_data = { - 'id': doc_id, - 'kb_id': kb_id, - 'name': name, - 'location': location, - 'size': size, - 'type': file_type, - 'created_by': created_by or 'system', - 'parser_id': parser_id or '', - 'parser_config': parser_config or {"pages": [[1, 1000000]]}, - 'source_type': 'local', - 'token_num': 0, - 'chunk_num': 0, - 'progress': 0, - 'progress_msg': '', - 'run': '0', # 未开始解析 - 'status': StatusEnum.VALID.value + "id": doc_id, + "kb_id": kb_id, + "name": name, + "location": location, + "size": size, + "type": file_type, + "created_by": created_by or "system", + "parser_id": parser_id or "", + "parser_config": parser_config or {"pages": [[1, 1000000]]}, + "source_type": "local", + "token_num": 0, + "chunk_num": 0, + "progress": 0, + "progress_msg": "", + "run": "0", # 未开始解析 + "status": StatusEnum.VALID.value, } - + return cls.insert(doc_data) - + @classmethod def get_by_kb_id(cls, kb_id: str) -> list[Document]: - return cls.query(kb_id=kb_id) \ No newline at end of file + return cls.query(kb_id=kb_id) diff --git a/management/server/services/files/file_service.py b/management/server/services/files/file_service.py index 866f264..222ce48 100644 --- a/management/server/services/files/file_service.py +++ b/management/server/services/files/file_service.py @@ -1,26 +1,20 @@ from peewee import * # noqa: F403 + from .base_service import BaseService from .models import File from .utils import FileType, get_uuid + class FileService(BaseService): model = File - + @classmethod def create_file(cls, parent_id: str, name: str, location: str, size: int, file_type: str) -> File: - return cls.insert({ - 'parent_id': parent_id, - 'name': name, - 'location': location, - 'size': size, - 'type': file_type, - 'source_type': 'knowledgebase' - }) - + return cls.insert({"parent_id": parent_id, "name": name, "location": location, "size": size, "type": file_type, "source_type": "knowledgebase"}) + @classmethod def get_parser(cls, file_type, filename, tenant_id): """获取适合文件类型的解析器ID""" - # 这里可能需要根据实际情况调整 if file_type == FileType.PDF.value: return "pdf_parser" elif file_type == FileType.WORD.value: @@ -40,7 +34,7 @@ class FileService(BaseService): def get_by_parent_id(cls, parent_id: str) -> list[File]: return cls.query(parent_id=parent_id) - @classmethod + @classmethod def generate_bucket_name(cls): """生成随机存储桶名称""" - return f"kb-{get_uuid()}" \ No newline at end of file + return f"kb-{get_uuid()}" diff --git a/management/server/services/files/service.py b/management/server/services/files/service.py index a2fc86b..471dccc 100644 --- a/management/server/services/files/service.py +++ b/management/server/services/files/service.py @@ -1,13 +1,14 @@ import os -import shutil import re +import shutil import tempfile -from dotenv import load_dotenv from datetime import datetime from pathlib import Path -from database import get_db_connection, get_minio_client, get_redis_connection -from .utils import FileType, FileSource, get_uuid +from database import get_db_connection, get_minio_client, get_redis_connection +from dotenv import load_dotenv + +from .utils import FileSource, FileType, get_uuid # 加载环境变量 load_dotenv("../../docker/.env") @@ -18,7 +19,7 @@ CHUNK_EXPIRY_SECONDS = 3600 * 24 # 分块24小时过期 temp_dir = tempfile.gettempdir() UPLOAD_FOLDER = os.path.join(temp_dir, "uploads") -ALLOWED_EXTENSIONS = {"pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", "jpg", "jpeg", "png", "bmp", "txt", "md", "html"} +ALLOWED_EXTENSIONS = {"pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", "jpg", "jpeg", "png", "bmp", "txt", "md", "html", "csv"} def allowed_file(filename): @@ -36,7 +37,7 @@ def filename_type(filename): return FileType.PDF.value elif ext in [".doc", ".docx"]: return FileType.WORD.value - elif ext in [".xls", ".xlsx"]: + elif ext in [".xls", ".xlsx", ".csv"]: return FileType.EXCEL.value elif ext in [".ppt", ".pptx"]: return FileType.PPT.value diff --git a/management/server/services/files/utils.py b/management/server/services/files/utils.py index ec55a54..0bf5f26 100644 --- a/management/server/services/files/utils.py +++ b/management/server/services/files/utils.py @@ -1,9 +1,9 @@ import uuid -from strenum import StrEnum from enum import Enum +from strenum import StrEnum + -# 参考:api.db class FileType(StrEnum): FOLDER = "folder" PDF = "pdf" @@ -15,15 +15,18 @@ class FileType(StrEnum): HTML = "html" OTHER = "other" + class FileSource(StrEnum): LOCAL = "" KNOWLEDGEBASE = "knowledgebase" S3 = "s3" + class StatusEnum(Enum): VALID = "1" INVALID = "0" + # 参考:api.utils def get_uuid(): - return uuid.uuid1().hex \ No newline at end of file + return uuid.uuid1().hex diff --git a/management/server/services/knowledgebases/document_parser.py b/management/server/services/knowledgebases/document_parser.py index 87aa349..e34363c 100644 --- a/management/server/services/knowledgebases/document_parser.py +++ b/management/server/services/knowledgebases/document_parser.py @@ -20,7 +20,7 @@ from magic_pdf.data.read_api import read_local_images, read_local_office from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from . import logger -from .excel_parser import parse_excel +from .excel_parser import parse_excel_file from .rag_tokenizer import RagTokenizer from .utils import _create_task_record, _update_document_progress, _update_kb_chunk_count, generate_uuid, get_bbox_from_block @@ -196,7 +196,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info): update_progress(0.8, "提取内容") # 处理内容列表 - content_list = parse_excel(temp_file_path) + content_list = parse_excel_file(temp_file_path) elif file_type.endswith("visual"): update_progress(0.3, "使用MinerU解析器") diff --git a/management/server/services/knowledgebases/excel_parser.py b/management/server/services/knowledgebases/excel_parser.py index fa1f4f8..d951a1e 100644 --- a/management/server/services/knowledgebases/excel_parser.py +++ b/management/server/services/knowledgebases/excel_parser.py @@ -1,24 +1,69 @@ +import os + import pandas as pd -def parse_excel(file_path): - all_sheets = pd.read_excel(file_path, sheet_name=None) - +def parse_excel_file(file_path): + """ + 通用表格解析函数,支持 Excel (.xlsx/.xls) 和 CSV 文件 + 返回统一格式的数据块列表 + """ blocks = [] - for sheet_name, df in all_sheets.items(): - df = df.ffill() - headers = df.columns.tolist() + # 根据文件扩展名选择读取方式 + file_ext = os.path.splitext(file_path)[1].lower() - for _, row in df.iterrows(): - html_table = "{}{}
".format("".join(f"{col}" for col in headers), "".join(f"{row[col]}" for col in headers)) - block = {"type": "table", "img_path": "", "table_caption": [f"Sheet: {sheet_name}"], "table_footnote": [], "table_body": f"{html_table}", "page_idx": 0} - blocks.append(block) + try: + if file_ext in (".xlsx", ".xls"): + # 处理Excel文件(多sheet) + all_sheets = pd.read_excel(file_path, sheet_name=None) + for sheet_name, df in all_sheets.items(): + blocks.extend(_process_dataframe(df, sheet_name)) + + elif file_ext == ".csv": + # 处理CSV文件(单sheet) + df = pd.read_csv(file_path) + blocks.extend(_process_dataframe(df, "CSV")) + + else: + raise ValueError(f"Unsupported file format: {file_ext}") + + except Exception as e: + raise ValueError(f"Failed to parse file {file_path}: {str(e)}") + + return blocks + + +def _process_dataframe(df, sheet_name): + """处理单个DataFrame,生成统一格式的数据块""" + df = df.ffill() + headers = df.columns.tolist() + blocks = [] + + for _, row in df.iterrows(): + html_table = "{}{}
".format("".join(f"{col}" for col in headers), "".join(f"{row[col]}" for col in headers)) + + block = {"type": "table", "img_path": "", "table_caption": "", "table_footnote": [], "table_body": html_table, "page_idx": 0} + blocks.append(block) return blocks if __name__ == "__main__": - file_path = "test_excel.xls" - parse_excel_result = parse_excel(file_path) - print(parse_excel_result) + # 测试示例 + excel_path = "test.xlsx" + csv_path = "test.csv" + + try: + # 测试Excel解析 + excel_blocks = parse_excel_file(excel_path) + print(f"Excel解析结果(共{len(excel_blocks)}条):") + print(excel_blocks[:1]) # 打印第一条示例 + + # 测试CSV解析 + csv_blocks = parse_excel_file(csv_path) + print(f"\nCSV解析结果(共{len(csv_blocks)}条):") + print(csv_blocks[:1]) + + except Exception as e: + print(f"错误: {str(e)}") diff --git a/management/server/utils.py b/management/server/utils.py index 0240f32..220072c 100644 --- a/management/server/utils.py +++ b/management/server/utils.py @@ -1,8 +1,9 @@ -import uuid import base64 -from flask import jsonify -from Cryptodome.PublicKey import RSA +import uuid + from Cryptodome.Cipher import PKCS1_v1_5 +from Cryptodome.PublicKey import RSA +from flask import jsonify from werkzeug.security import generate_password_hash @@ -10,6 +11,7 @@ from werkzeug.security import generate_password_hash def generate_uuid(): return str(uuid.uuid4()).replace("-", "") + # RSA 加密密码 def rsa_psw(password: str) -> str: pub_key = """-----BEGIN PUBLIC KEY----- @@ -21,26 +23,22 @@ def rsa_psw(password: str) -> str: encrypted_data = cipher.encrypt(base64.b64encode(password.encode())) return base64.b64encode(encrypted_data).decode() + # 加密密码 def encrypt_password(raw_password: str) -> str: base64_password = base64.b64encode(raw_password.encode()).decode() return generate_password_hash(base64_password) + # 标准响应格式 def success_response(data=None, message="操作成功", code=0): - return jsonify({ - "code": code, - "message": message, - "data": data - }) + return jsonify({"code": code, "message": message, "data": data}) + # 错误响应格式 def error_response(message="操作失败", code=500, details=None): """标准错误响应格式""" - response = { - "code": code, - "message": message - } + response = {"code": code, "message": message} if details: response["details"] = details - return jsonify(response), code if code >= 400 else 500 \ No newline at end of file + return jsonify(response), code if code >= 400 else 500