import os
import mysql.connector
import re
from io import BytesIO
from minio import Minio
from dotenv import load_dotenv
from werkzeug.utils import secure_filename
from datetime import datetime
from .utils import FileType, FileSource, StatusEnum, get_uuid
from .document_service import DocumentService
from .file_service import FileService
from .file2document_service import File2DocumentService
from database import DB_CONFIG, MINIO_CONFIG

# Load environment variables
load_dotenv("../../docker/.env")

UPLOAD_FOLDER = '/data/uploads'
ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
                      'jpg', 'jpeg', 'png', 'txt', 'md'}


def allowed_file(filename):
    """Check if the file extension is allowed"""
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def filename_type(filename):
    """Determine the file type from the file name"""
    ext = os.path.splitext(filename)[1].lower()
    if ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']:
        return FileType.VISUAL.value
    elif ext in ['.pdf']:
        return FileType.PDF.value
    elif ext in ['.doc', '.docx']:
        return FileType.WORD.value
    elif ext in ['.xls', '.xlsx']:
        return FileType.EXCEL.value
    elif ext in ['.ppt', '.pptx']:
        return FileType.PPT.value
    elif ext in ['.txt', '.md']:  # Support for txt and md files
        return FileType.TEXT.value
    return FileType.OTHER.value


def get_minio_client():
    """Create a MinIO client"""
    return Minio(
        endpoint=MINIO_CONFIG["endpoint"],
        access_key=MINIO_CONFIG["access_key"],
        secret_key=MINIO_CONFIG["secret_key"],
        secure=MINIO_CONFIG["secure"]
    )


def get_db_connection():
    """Create a database connection"""
    return mysql.connector.connect(**DB_CONFIG)


def get_files_list(current_page, page_size, name_filter=""):
    """
    Get the paginated file list.

    Args:
        current_page: current page number
        page_size: page size
        name_filter: filter on the file name

    Returns:
        tuple: (file list, total count)
    """
    try:
        # Calculate the offset
        offset = (current_page - 1) * page_size

        # Connect to the database
        conn = get_db_connection()
        cursor = conn.cursor(dictionary=True)

        # Build the filter clause
        where_clause = ""
        params = []
        if name_filter:
            where_clause = "WHERE d.name LIKE %s"
            params.append(f"%{name_filter}%")

        # Query the total count
        count_query = f"""
            SELECT COUNT(*) as total
            FROM document d
            {where_clause}
        """
        cursor.execute(count_query, params)
        total = cursor.fetchone()['total']

        # Query the file list
        query = f"""
            SELECT d.id, d.name, d.kb_id, d.location, d.size, d.type, d.create_time
            FROM document d
            {where_clause}
            ORDER BY d.create_time DESC
            LIMIT %s OFFSET %s
        """
        cursor.execute(query, params + [page_size, offset])
        documents = cursor.fetchall()

        # Fetch the document-to-file mappings
        doc_ids = [doc['id'] for doc in documents]
        file_mappings = {}
        if doc_ids:
            placeholders = ', '.join(['%s'] * len(doc_ids))
            cursor.execute(f"""
                SELECT f2d.document_id, f.id as file_id, f.parent_id, f.source_type
                FROM file2document f2d
                JOIN file f ON f2d.file_id = f.id
                WHERE f2d.document_id IN ({placeholders})
            """, doc_ids)
            for row in cursor.fetchall():
                file_mappings[row['document_id']] = {
                    'file_id': row['file_id'],
                    'parent_id': row['parent_id'],
                    'source_type': row['source_type']
                }

        # Assemble the results
        result = []
        for doc in documents:
            doc_id = doc['id']
            kb_id = doc['kb_id']
            location = doc['location']

            # Determine the storage location
            storage_bucket = kb_id
            storage_location = location

            # If a file mapping exists, check whether the file's parent_id should be used as the bucket
            if doc_ids and doc_id in file_mappings:
                file_info = file_mappings[doc_id]
                # Mirror the logic of File2DocumentService.get_storage_address
                if file_info.get('source_type') is None or file_info.get('source_type') == 0:  # LOCAL
                    storage_bucket = file_info['parent_id']

            # Build the result dictionary
            result_item = {
                'id': doc_id,
                'name': doc.get('name', ''),
                'kb_id': kb_id,
                'size': doc.get('size', 0),
                'type': doc.get('type', ''),
                'location': location,
                'create_time': doc.get('create_time', 0)
            }
            result.append(result_item)

        cursor.close()
        conn.close()

        return result, total
    except Exception as e:
        raise e

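# Usage sketch (hypothetical caller, not part of the service itself): fetch the first
# page of 10 documents whose name contains "report". Assumes DB_CONFIG points at a
# reachable MySQL instance with the document table populated.
#
#     files, total = get_files_list(current_page=1, page_size=10, name_filter="report")
#     print(f"{len(files)} of {total} documents loaded")
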
def get_file_info(file_id):
    """
    Get file information.

    Args:
        file_id: file ID

    Returns:
        tuple: (document info, file mapping info, storage bucket, storage location)
    """
    try:
        # Connect to the database
        conn = get_db_connection()
        cursor = conn.cursor(dictionary=True)

        # Query the document information
        cursor.execute("""
            SELECT d.id, d.name, d.kb_id, d.location, d.type
            FROM document d
            WHERE d.id = %s
        """, (file_id,))
        document = cursor.fetchone()

        if not document:
            cursor.close()
            conn.close()
            return None, None, None, None

        # Fetch the document-to-file mapping
        cursor.execute("""
            SELECT f2d.document_id, f.id as file_id, f.parent_id, f.source_type
            FROM file2document f2d
            JOIN file f ON f2d.file_id = f.id
            WHERE f2d.document_id = %s
        """, (file_id,))
        file_mapping = cursor.fetchone()

        # Determine the storage location
        storage_bucket = document['kb_id']
        storage_location = document['location']

        # If a file mapping exists, check whether the file's parent_id should be used as the bucket
        if file_mapping:
            # Mirror the logic of File2DocumentService.get_storage_address
            if file_mapping.get('source_type') is None or file_mapping.get('source_type') == 0:  # LOCAL
                storage_bucket = file_mapping['parent_id']

        cursor.close()
        conn.close()

        return document, file_mapping, storage_bucket, storage_location
    except Exception as e:
        raise e


def download_file_from_minio(storage_bucket, storage_location):
    """
    Download a file from MinIO.

    Args:
        storage_bucket: storage bucket
        storage_location: storage location (object name)

    Returns:
        bytes: file data
    """
    try:
        # Create the MinIO client
        minio_client = get_minio_client()

        # Check that the bucket exists
        if not minio_client.bucket_exists(storage_bucket):
            raise Exception(f"Bucket {storage_bucket} does not exist")

        # Download the object
        response = minio_client.get_object(storage_bucket, storage_location)
        file_data = response.read()

        return file_data
    except Exception as e:
        raise e

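# Usage sketch (hypothetical caller): resolve a document's storage address and read the
# raw bytes, e.g. wrapped in BytesIO for a download response. Assumes the document
# exists and MINIO_CONFIG points at a reachable MinIO server.
#
#     document, _, bucket, location = get_file_info(doc_id)
#     if document:
#         payload = BytesIO(download_file_from_minio(bucket, location))
#         # hand `payload` and document['name'] to the web framework's send-file helper
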
def delete_file(file_id):
    """
    Delete a file.

    Args:
        file_id: file ID

    Returns:
        bool: whether the deletion succeeded
    """
    try:
        # Get the file information
        document, file_mapping, storage_bucket, storage_location = get_file_info(file_id)

        if not document:
            return False

        # Connect to the database
        conn = get_db_connection()
        cursor = conn.cursor(dictionary=True)

        # If a file mapping exists, get the file ID
        file_id_to_delete = None
        if file_mapping:
            file_id_to_delete = file_mapping['file_id']

        # Start a transaction
        conn.start_transaction()

        try:
            # 1. Delete the record from the document table
            cursor.execute("DELETE FROM document WHERE id = %s", (file_id,))

            # 2. Delete the associated file2document record, if any
            if file_mapping:
                cursor.execute("DELETE FROM file2document WHERE document_id = %s", (file_id,))

            # 3. Delete the associated file record, if any
            if file_id_to_delete:
                cursor.execute("DELETE FROM file WHERE id = %s", (file_id_to_delete,))

            # Commit the transaction
            conn.commit()

            # Delete the object from MinIO
            try:
                minio_client = get_minio_client()
                # Check that the bucket exists
                if minio_client.bucket_exists(storage_bucket):
                    # Remove the object
                    minio_client.remove_object(storage_bucket, storage_location)
            except Exception as e:
                # A MinIO failure does not invalidate the successful database deletion
                print(f"Failed to delete file from MinIO: {str(e)}")

            return True
        except Exception as e:
            # Roll back the transaction
            conn.rollback()
            raise e
        finally:
            cursor.close()
            conn.close()
    except Exception as e:
        raise e

def batch_delete_files(file_ids):
    """
    Delete files in batch.

    Args:
        file_ids: list of file IDs

    Returns:
        int: number of files successfully deleted
    """
    if not file_ids:
        return 0

    try:
        # Connect to the database
        conn = get_db_connection()
        cursor = conn.cursor(dictionary=True)

        # Create the MinIO client
        minio_client = get_minio_client()

        # Start a transaction
        conn.start_transaction()

        try:
            success_count = 0
            for file_id in file_ids:
                # Query the document information
                cursor.execute("""
                    SELECT d.id, d.kb_id, d.location
                    FROM document d
                    WHERE d.id = %s
                """, (file_id,))
                document = cursor.fetchone()

                if not document:
                    continue

                # Fetch the document-to-file mapping
                cursor.execute("""
                    SELECT f2d.id as f2d_id, f2d.document_id, f2d.file_id, f.parent_id, f.source_type
                    FROM file2document f2d
                    JOIN file f ON f2d.file_id = f.id
                    WHERE f2d.document_id = %s
                """, (file_id,))
                file_mapping = cursor.fetchone()

                # Determine the storage location
                storage_bucket = document['kb_id']
                storage_location = document['location']

                # If a file mapping exists, check whether the file's parent_id should be used as the bucket
                file_id_to_delete = None
                if file_mapping:
                    file_id_to_delete = file_mapping['file_id']
                    # Mirror the logic of File2DocumentService.get_storage_address
                    if file_mapping.get('source_type') is None or file_mapping.get('source_type') == 0:  # LOCAL
                        storage_bucket = file_mapping['parent_id']

                # 1. Delete the record from the document table
                cursor.execute("DELETE FROM document WHERE id = %s", (file_id,))

                # 2. Delete the associated file2document record, if any
                if file_mapping:
                    cursor.execute("DELETE FROM file2document WHERE id = %s", (file_mapping['f2d_id'],))

                # 3. Delete the associated file record, if any
                if file_id_to_delete:
                    cursor.execute("DELETE FROM file WHERE id = %s", (file_id_to_delete,))

                # Delete the object from MinIO
                try:
                    # Check that the bucket exists
                    if minio_client.bucket_exists(storage_bucket):
                        # Remove the object
                        minio_client.remove_object(storage_bucket, storage_location)
                except Exception as e:
                    # A MinIO failure does not invalidate the successful database deletion
                    print(f"Failed to delete file from MinIO: {str(e)}")

                success_count += 1

            # Commit the transaction
            conn.commit()

            return success_count
        except Exception as e:
            # Roll back the transaction
            conn.rollback()
            raise e
        finally:
            cursor.close()
            conn.close()
    except Exception as e:
        raise e

def upload_files_to_server(files, kb_id=None, user_id=None, parent_id=None):
    """Core logic for uploading files to the server"""
    if user_id is None:
        try:
            conn = get_db_connection()
            cursor = conn.cursor(dictionary=True)

            # Query the ID of the earliest-created user
            query_earliest_user = """
                SELECT id FROM user
                WHERE create_time = (SELECT MIN(create_time) FROM user)
                LIMIT 1
            """
            cursor.execute(query_earliest_user)
            earliest_user = cursor.fetchone()

            if earliest_user:
                user_id = earliest_user['id']
                print(f"Using the earliest-created user ID: {user_id}")
            else:
                user_id = 'system'
                print("No user found, using default user ID: system")

            cursor.close()
            conn.close()
        except Exception as e:
            print(f"Failed to query the earliest user ID: {str(e)}")
            user_id = 'system'

    # If no parent_id was given, use the user's root folder ID
    if parent_id is None:
        try:
            conn = get_db_connection()
            cursor = conn.cursor(dictionary=True)

            # Query the user's root folder
            query_root_folder = """
                SELECT id FROM file
                WHERE tenant_id = %s AND parent_id = id
                LIMIT 1
            """
            cursor.execute(query_root_folder, (user_id,))
            root_folder = cursor.fetchone()

            if root_folder:
                parent_id = root_folder['id']
                print(f"Using the user's root folder ID: {parent_id}")
            else:
                # Create a root folder if none was found
                root_id = get_uuid()
                # Date string includes hours, minutes and seconds
                current_time = int(datetime.now().timestamp())
                current_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                root_folder = {
                    "id": root_id,
                    "parent_id": root_id,  # The root folder's parent_id points to itself
                    "tenant_id": user_id,
                    "created_by": user_id,
                    "name": "/",
                    "type": FileType.FOLDER.value,
                    "size": 0,
                    "location": "",
                    "source_type": FileSource.LOCAL.value,
                    "create_time": current_time,
                    "create_date": current_date,
                    "update_time": current_time,
                    "update_date": current_date
                }
                FileService.insert(root_folder)
                parent_id = root_id
                print(f"Created and using new root folder ID: {parent_id}")

            cursor.close()
            conn.close()
        except Exception as e:
            print(f"Failed to query the root folder ID: {str(e)}")
            # If the root folder cannot be resolved, fall back to the per-file bucket ID
            parent_id = None

    results = []
    for file in files:
        if file.filename == '':
            continue

        if file and allowed_file(file.filename):
            # Generate an independent bucket name for each file
            file_bucket_id = FileService.generate_bucket_name()

            original_filename = file.filename
            # Sanitize the file name while keeping Chinese and other Unicode characters:
            # only characters that are unsafe for file systems are replaced
            name, ext = os.path.splitext(original_filename)
            safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
            # If the sanitized name is empty, use a random string
            if not safe_name or safe_name.strip() == '':
                safe_name = f"file_{get_uuid()[:8]}"

            filename = safe_name + ext.lower()
            filepath = os.path.join(UPLOAD_FOLDER, filename)

            try:
                # 1. Save the file to the local temporary directory
                os.makedirs(UPLOAD_FOLDER, exist_ok=True)
                file.save(filepath)
                print(f"File saved to temporary directory: {filepath}")
                print(f"Original file name: {original_filename}, sanitized file name: {filename}, extension: {ext[1:]}")

                # 2. Determine the file type from the sanitized file name
                filetype = filename_type(filename)
                if filetype == FileType.OTHER.value:
                    raise RuntimeError("Unsupported file type")

                # 3. Generate a unique storage location
                minio_client = get_minio_client()
                location = filename
                # Make sure the bucket exists (one bucket per file)
                if not minio_client.bucket_exists(file_bucket_id):
                    minio_client.make_bucket(file_bucket_id)
                    print(f"Created MinIO bucket: {file_bucket_id}")

                # 4. Upload to MinIO (using the per-file bucket)
                with open(filepath, 'rb') as file_data:
                    minio_client.put_object(
                        bucket_name=file_bucket_id,
                        object_name=location,
                        data=file_data,
                        length=os.path.getsize(filepath)
                    )
                print(f"File uploaded to MinIO: {file_bucket_id}/{location}")

                # 5. Create a thumbnail for image/PDF files (only the object name is generated here)
                thumbnail_location = ''
                if filetype in [FileType.VISUAL.value, FileType.PDF.value]:
                    try:
                        thumbnail_location = f'thumbnail_{get_uuid()}.png'
                    except Exception as e:
                        print(f"Failed to generate thumbnail: {str(e)}")

                # 6. Create the database record
                doc_id = get_uuid()
                # Date string includes hours, minutes and seconds
                current_time = int(datetime.now().timestamp())
                current_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                doc = {
                    "id": doc_id,
                    "kb_id": file_bucket_id,  # Use the per-file bucket ID
                    "parser_id": FileService.get_parser(filetype, filename, ""),
                    "parser_config": {"pages": [[1, 1000000]]},
                    "source_type": "local",
                    "created_by": user_id or 'system',
                    "type": filetype,
                    "name": filename,
                    "location": location,
                    "size": os.path.getsize(filepath),
                    "thumbnail": thumbnail_location,
                    "token_num": 0,
                    "chunk_num": 0,
                    "progress": 0,
                    "progress_msg": "",
                    "run": "0",
                    "status": StatusEnum.VALID.value,
                    "create_time": current_time,
                    "create_date": current_date,
                    "update_time": current_time,
                    "update_date": current_date
                }

                # 7. Save the document record (with transaction handling)
                conn = get_db_connection()
                try:
                    cursor = conn.cursor()
                    DocumentService.insert(doc)
                    print(f"Document record saved to MySQL: {doc_id}")

                    # 8. Create the file record and its association
                    file_record = {
                        "id": get_uuid(),
                        "parent_id": parent_id or file_bucket_id,  # Prefer the explicit parent_id
                        "tenant_id": user_id or 'system',
                        "created_by": user_id or 'system',
                        "name": filename,
                        "type": filetype,
                        "size": doc["size"],
                        "location": location,
                        "source_type": FileSource.KNOWLEDGEBASE.value,
                        "create_time": current_time,
                        "create_date": current_date,
                        "update_time": current_time,
                        "update_date": current_date
                    }
                    FileService.insert(file_record)
                    print(f"File record saved to MySQL: {file_record['id']}")

                    # 9. Create the file-to-document association
                    File2DocumentService.insert({
                        "id": get_uuid(),
                        "file_id": file_record["id"],
                        "document_id": doc_id,
                        "create_time": current_time,
                        "create_date": current_date,
                        "update_time": current_time,
                        "update_date": current_date
                    })
                    print(f"Association record saved to MySQL: {file_record['id']} -> {doc_id}")

                    conn.commit()

                    results.append({
                        'id': doc_id,
                        'name': filename,
                        'size': doc["size"],
                        'type': filetype,
                        'status': 'success'
                    })
                except Exception as e:
                    conn.rollback()
                    print(f"Database operation failed: {str(e)}")
                    raise
                finally:
                    cursor.close()
                    conn.close()
            except Exception as e:
                results.append({
                    'name': filename,
                    'error': str(e),
                    'status': 'failed'
                })
                print(f"Error while uploading file: {filename}, error: {str(e)}")
            finally:
                # Remove the temporary file
                if os.path.exists(filepath):
                    os.remove(filepath)

    return {
        'code': 0,
        'data': results,
        'message': f'Successfully uploaded {len([r for r in results if r["status"] == "success"])}/{len(files)} files'
    }
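
# Minimal smoke-test sketch, not part of the service API. Assumptions: DB_CONFIG and
# MINIO_CONFIG point at reachable MySQL/MinIO instances, and the module is started via
# `python -m <package>.<this_module>` so the relative imports above resolve. It only
# lists the first page of documents and re-resolves the first one's storage address;
# nothing is uploaded or deleted.
if __name__ == "__main__":
    docs, total = get_files_list(current_page=1, page_size=5)
    print(f"Listed {len(docs)} of {total} documents")
    if docs:
        document, mapping, bucket, location = get_file_info(docs[0]['id'])
        print(f"First document resolves to MinIO object {bucket}/{location}")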