refactor(management): 后台解析时,添加日志记录

- 在管理端应用中添加日志记录功能,用于记录解析过程中的信息和错误
- 优化代码格式和结构,提高可读性和可维护性
- 在 docker-compose.yml 中添加日志目录挂载
- 清理无用的环境变量加载代码
This commit is contained in:
zstar 2025-06-10 12:29:26 +08:00
parent 2249ef3083
commit 59d5ca5c95
7 changed files with 110 additions and 98 deletions

1
.gitignore vendored
View File

@ -55,3 +55,4 @@ management/web/types/auto
web/node_modules/.cache/logger/umi.log
management/models--slanet_plus
node_modules/.cache/logger/umi.log
*.log

View File

@ -1,15 +1,7 @@
import os
from pathlib import Path
from dotenv import load_dotenv
from minio import Minio
from api.root_path import get_root_folder
# 加载环境变量
env_path = Path(get_root_folder()) / "docker" / ".env"
load_dotenv(env_path)
# 检测是否在Docker容器中运行
def is_running_in_docker():

View File

@ -19,6 +19,8 @@ services:
- ./nginx/nginx.conf:/etc/nginx/nginx.conf
- ../api/db/services/database.py:/ragflow/api/db/services/database.py
- ../api/db/services/dialog_service.py:/ragflow/api/db/services/dialog_service.py
- ../api/ragflow_server.py:/ragflow/api/ragflow_server.py
- ../api/root_path.py:/ragflow/api/root_path.py
env_file: .env
environment:
- TZ=${TIMEZONE}
@ -57,6 +59,7 @@ services:
ports:
- "5000:5000"
volumes:
- ./ragflow-plus-logs:/app/logs
- ./magic-pdf.json:/root/magic-pdf.json
depends_on:
mysql:

View File

@ -1,70 +1,79 @@
import jwt
import logging
import os
from datetime import datetime, timedelta
import jwt
from dotenv import load_dotenv
from flask import Flask, request
from flask_cors import CORS
from datetime import datetime, timedelta
from routes import register_routes
from dotenv import load_dotenv
# Load environment variables from docker/.env (three directory levels above this file)
load_dotenv(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "docker", ".env"))

app = Flask(__name__)

# Enable CORS so the management front-end (any origin) can call the /api/* routes
CORS(app, resources={r"/api/*": {"origins": "*", "methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS"], "allow_headers": ["Content-Type", "Authorization"]}})

# Register all application routes
register_routes(app)

# Admin credentials and JWT secret come from the environment; the literals below
# are insecure development defaults — override them in production deployments.
ADMIN_USERNAME = os.getenv("MANAGEMENT_ADMIN_USERNAME", "admin")
ADMIN_PASSWORD = os.getenv("MANAGEMENT_ADMIN_PASSWORD", "12345678")
JWT_SECRET = os.getenv("MANAGEMENT_JWT_SECRET", "your-secret-key")

# Log directory and file for parser logging (created on startup if missing)
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "parser.log")

# Configure logging: write to logs/parser.log and echo to the console
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    handlers=[
        logging.FileHandler(log_file, encoding="utf-8"),
        logging.StreamHandler(),  # also emit to the console
    ],
)
# Issue a signed JWT for an authenticated user
def generate_token(username):
    """Return a JWT for *username* that expires one hour from now.

    The token carries ``username`` and ``exp`` claims and is signed with
    the module-level ``JWT_SECRET`` using the HS256 algorithm.
    """
    # Token lifetime: 1 hour from now.
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
    # datetime.now(timezone.utc) is the modern equivalent — confirm PyJWT
    # handles the aware datetime before switching.
    expire_time = datetime.utcnow() + timedelta(hours=1)
    token = jwt.encode({"username": username, "exp": expire_time}, JWT_SECRET, algorithm="HS256")
    return token
# Login route kept in the main module (all other routes are registered via routes/)
@app.route("/api/v1/auth/login", methods=["POST"])
def login():
    """Authenticate the admin user and issue a JWT.

    Expects a JSON body with ``username`` and ``password``. Returns
    ``{"code": 0, "data": {"token": ...}, "message": ...}`` on success,
    or a ``code: 1`` payload with HTTP 400 when the username is unknown
    or the password is wrong.
    """
    data = request.get_json()
    username = data.get("username")
    password = data.get("password")

    # Accepted credentials: the single admin account from the environment
    valid_users = {ADMIN_USERNAME: ADMIN_PASSWORD}

    # Reject unknown usernames
    if not username or username not in valid_users:
        return {"code": 1, "message": "用户名不存在"}, 400

    # Reject wrong passwords
    # NOTE(review): plaintext, non-constant-time comparison — consider
    # hashed storage and hmac.compare_digest for production hardening.
    if not password or password != valid_users[username]:
        return {"code": 1, "message": "密码错误"}, 400

    # Credentials are valid: issue the token
    token = generate_token(username)
    return {"code": 0, "data": {"token": token}, "message": "登录成功"}
if __name__ == "__main__":
    # Bind to all interfaces so the service is reachable from inside Docker
    app.run(host="0.0.0.0", port=5000)

View File

@ -1,23 +1,28 @@
import os
import tempfile
import shutil
import json
import logging
import os
import re
import shutil
import tempfile
import time
import traceback
import re
import requests
from io import BytesIO
from datetime import datetime
from database import MINIO_CONFIG, get_minio_client, get_es_client, get_db_connection
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.read_api import read_local_office, read_local_images
from utils import generate_uuid
from io import BytesIO
from urllib.parse import urlparse
from .rag_tokenizer import RagTokenizer
import requests
from database import MINIO_CONFIG, get_db_connection, get_es_client, get_minio_client
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.data.read_api import read_local_images, read_local_office
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from utils import generate_uuid
from .excel_parser import parse_excel
from .rag_tokenizer import RagTokenizer
logger = logging.getLogger(__name__)
tknzr = RagTokenizer()
@ -64,7 +69,7 @@ def _update_document_progress(doc_id, progress=None, message=None, status=None,
cursor.execute(query, params)
conn.commit()
except Exception as e:
print(f"[Parser-ERROR] 更新文档 {doc_id} 进度失败: {e}")
logger.error(f"[Parser-ERROR] 更新文档 {doc_id} 进度失败: {e}")
finally:
if cursor:
cursor.close()
@ -89,7 +94,7 @@ def _update_kb_chunk_count(kb_id, count_delta):
cursor.execute(kb_update, (count_delta, current_date, kb_id))
conn.commit()
except Exception as e:
print(f"[Parser-ERROR] 更新知识库 {kb_id} 块数量失败: {e}")
logger.error(f"[Parser-ERROR] 更新知识库 {kb_id} 块数量失败: {e}")
finally:
if cursor:
cursor.close()
@ -146,10 +151,10 @@ def _create_task_record(doc_id, chunk_ids_list):
task_insert = f"INSERT INTO task ({fields_sql}) VALUES ({placeholders})"
cursor.execute(task_insert, common_values)
conn.commit()
print(f"[Parser-INFO] Task记录创建成功Task ID: {task_id}")
logger.info(f"[Parser-INFO] Task记录创建成功Task ID: {task_id}")
except Exception as e:
print(f"[Parser-ERROR] 创建Task记录失败: {e}")
logger.error(f"[Parser-ERROR] 创建Task记录失败: {e}")
finally:
if cursor:
cursor.close()
@ -173,7 +178,7 @@ def get_bbox_from_block(block):
if isinstance(bbox, list) and len(bbox) == 4 and all(isinstance(n, (int, float)) for n in bbox):
return bbox
else:
print(f"[Parser-WARNING] 块的 bbox 格式无效: {bbox},将使用默认值。")
logger.warning(f"[Parser-WARNING] 块的 bbox 格式无效: {bbox},将使用默认值。")
# 如果 block 不是字典或没有 bbox 键,或 bbox 格式无效,返回默认值
return [0, 0, 0, 0]
@ -212,7 +217,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
# 如果 API 基础地址为空字符串,设置为硅基流动的 API 地址
if embedding_api_base == "":
embedding_api_base = "https://api.siliconflow.cn/v1/embeddings"
print(f"[Parser-INFO] API 基础地址为空,已设置为硅基流动的 API 地址: {embedding_api_base}")
logger.info(f"[Parser-INFO] API 基础地址为空,已设置为硅基流动的 API 地址: {embedding_api_base}")
embedding_api_key = embedding_config.get("api_key") if embedding_config else None # 可能为 None 或空字符串
@ -238,7 +243,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
else:
embedding_url = normalized_base_url + "/v1/embeddings"
print(f"[Parser-INFO] 使用 Embedding 配置: URL='{embedding_url}', Model='{embedding_model_name}', Key={embedding_api_key}")
logger.info(f"[Parser-INFO] 使用 Embedding 配置: URL='{embedding_url}', Model='{embedding_model_name}', Key={embedding_api_key}")
try:
kb_id = doc_info["kb_id"]
@ -252,7 +257,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
# 进度更新回调 (直接调用内部更新函数)
def update_progress(prog=None, msg=None):
_update_document_progress(doc_id, progress=prog, message=msg)
print(f"[Parser-PROGRESS] Doc: {doc_id}, Progress: {prog}, Message: {msg}")
logger.info(f"[Parser-PROGRESS] Doc: {doc_id}, Progress: {prog}, Message: {msg}")
# 1. 从 MinIO 获取文件内容
minio_client = get_minio_client()
@ -310,7 +315,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
with open(temp_file_path, "wb") as f:
f.write(file_content)
print(f"[Parser-INFO] 临时文件路径: {temp_file_path}")
logger.info(f"[Parser-INFO] 临时文件路径: {temp_file_path}")
# 使用MinerU处理
ds = read_local_office(temp_file_path)[0]
infer_result = ds.apply(doc_analyze, ocr=True)
@ -338,7 +343,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
with open(temp_file_path, "wb") as f:
f.write(file_content)
print(f"[Parser-INFO] 临时文件路径: {temp_file_path}")
logger.info(f"[Parser-INFO] 临时文件路径: {temp_file_path}")
update_progress(0.8, "提取内容")
# 处理内容列表
@ -353,7 +358,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
with open(temp_file_path, "wb") as f:
f.write(file_content)
print(f"[Parser-INFO] 临时文件路径: {temp_file_path}")
logger.info(f"[Parser-INFO] 临时文件路径: {temp_file_path}")
# 使用MinerU处理
ds = read_local_images(temp_file_path)[0]
infer_result = ds.apply(doc_analyze, ocr=True)
@ -390,7 +395,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
middle_data = middle_json_content # 直接赋值
else:
middle_data = None
print(f"[Parser-WARNING] middle_json_content 不是预期的字典格式,实际类型: {type(middle_json_content)}")
logger.warning(f"[Parser-WARNING] middle_json_content 不是预期的字典格式,实际类型: {type(middle_json_content)}")
# 提取信息
for page_idx, page_data in enumerate(middle_data.get("pdf_info", [])):
for block in page_data.get("preproc_blocks", []):
@ -399,15 +404,15 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
if block_bbox != [0, 0, 0, 0]:
block_info_list.append({"page_idx": page_idx, "bbox": block_bbox})
else:
print("[Parser-WARNING] 块的 bbox 格式无效,跳过。")
logger.warning("[Parser-WARNING] 块的 bbox 格式无效,跳过。")
print(f"[Parser-INFO] 从 middle_data 提取了 {len(block_info_list)} 个块的信息。")
logger.info(f"[Parser-INFO] 从 middle_data 提取了 {len(block_info_list)} 个块的信息。")
except json.JSONDecodeError:
print("[Parser-ERROR] 解析 middle_json_content 失败。")
logger.error("[Parser-ERROR] 解析 middle_json_content 失败。")
raise Exception("[Parser-ERROR] 解析 middle_json_content 失败。")
except Exception as e:
print(f"[Parser-ERROR] 处理 middle_json_content 时出错: {e}")
logger.error(f"[Parser-ERROR] 处理 middle_json_content 时出错: {e}")
raise Exception(f"[Parser-ERROR] 处理 middle_json_content 时出错: {e}")
# 3. 处理解析结果 (上传到MinIO, 存储到ES)
@ -417,7 +422,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
output_bucket = kb_id
if not minio_client.bucket_exists(output_bucket):
minio_client.make_bucket(output_bucket)
print(f"[Parser-INFO] 创建MinIO桶: {output_bucket}")
logger.info(f"[Parser-INFO] 创建MinIO桶: {output_bucket}")
index_name = f"ragflow_{tenant_id}"
if not es_client.indices.exists(index=index_name):
@ -431,7 +436,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
},
},
)
print(f"[Parser-INFO] 创建Elasticsearch索引: {index_name}")
logger.info(f"[Parser-INFO] 创建Elasticsearch索引: {index_name}")
chunk_count = 0
chunk_ids_list = []
@ -447,13 +452,13 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
bbox = block_info.get("bbox", [0, 0, 0, 0])
# 验证 bbox 是否有效,如果无效则重置为默认值 (可选,取决于是否需要严格验证)
if not (isinstance(bbox, list) and len(bbox) == 4 and all(isinstance(n, (int, float)) for n in bbox)):
print(f"[Parser-WARNING] Chunk {chunk_idx} 对应的 bbox 格式无效: {bbox},将使用默认值。")
logger.info(f"[Parser-WARNING] Chunk {chunk_idx} 对应的 bbox 格式无效: {bbox},将使用默认值。")
bbox = [0, 0, 0, 0]
else:
# 如果 block_info_list 的长度小于 content_list打印警告
# 仅在第一次索引越界时打印一次警告,避免刷屏
if chunk_idx == len(block_info_list):
print(f"[Parser-WARNING] block_info_list 的长度 ({len(block_info_list)}) 小于 content_list 的长度 ({len(content_list)})。后续块将使用默认 page_idx 和 bbox。")
logger.warning(f"[Parser-WARNING] block_info_list 的长度 ({len(block_info_list)}) 小于 content_list 的长度 ({len(content_list)})。后续块将使用默认 page_idx 和 bbox。")
if chunk_data["type"] == "text" or chunk_data["type"] == "table" or chunk_data["type"] == "equation":
if chunk_data["type"] == "text":
@ -531,16 +536,16 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
q_1024_vec = embedding_data.get("embedding")
else:
q_1024_vec = embedding_data["data"][0]["embedding"]
print(f"[Parser-INFO] 获取embedding成功长度: {len(q_1024_vec)}")
# logger.info(f"[Parser-INFO] 获取embedding成功长度: {len(q_1024_vec)}")
# 检查向量维度是否为1024
if len(q_1024_vec) != 1024:
error_msg = f"[Parser-ERROR] Embedding向量维度不是1024实际维度: {len(q_1024_vec)}, 建议使用bge-m3模型"
print(error_msg)
logger.error(error_msg)
update_progress(-5, error_msg)
raise ValueError(error_msg)
except Exception as e:
print(f"[Parser-ERROR] 获取embedding失败: {e}")
logger.error(f"[Parser-ERROR] 获取embedding失败: {e}")
raise Exception(f"[Parser-ERROR] 获取embedding失败: {e}")
chunk_id = generate_uuid()
@ -587,8 +592,8 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
chunk_ids_list.append(chunk_id)
except Exception as e:
print(f"[Parser-ERROR] 处理文本块 {chunk_idx} (page: {page_idx}, bbox: {bbox}) 失败: {e}")
traceback.print_exc() # 打印更详细的错误
logger.error(f"[Parser-ERROR] 处理文本块 {chunk_idx} (page: {page_idx}, bbox: {bbox}) 失败: {e}")
traceback.logger.info_exc() # 打印更详细的错误
raise Exception(f"[Parser-ERROR] 处理文本块 {chunk_idx} (page: {page_idx}, bbox: {bbox}) 失败: {e}")
elif chunk_data["type"] == "image":
@ -598,7 +603,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
img_path_abs = os.path.join(temp_image_dir, os.path.basename(img_path_relative))
if not os.path.exists(img_path_abs):
print(f"[Parser-WARNING] 图片文件不存在: {img_path_abs}")
logger.warning(f"[Parser-WARNING] 图片文件不存在: {img_path_abs}")
continue
img_id = generate_uuid()
@ -616,7 +621,7 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
policy = {"Version": "2012-10-17", "Statement": [{"Effect": "Allow", "Principal": {"AWS": "*"}, "Action": ["s3:GetObject"], "Resource": [f"arn:aws:s3:::{kb_id}/images/*"]}]}
minio_client.set_bucket_policy(kb_id, json.dumps(policy))
print(f"成功上传图片: {img_key}")
logger.info(f"成功上传图片: {img_key}")
minio_endpoint = MINIO_CONFIG["endpoint"]
use_ssl = MINIO_CONFIG.get("secure", False)
protocol = "https" if use_ssl else "http"
@ -629,14 +634,14 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
}
image_info_list.append(image_info)
print(f"图片访问链接: {img_url}")
logger.info(f"图片访问链接: {img_url}")
except Exception as e:
print(f"[Parser-ERROR] 上传图片 {img_path_abs} 失败: {e}")
logger.error(f"[Parser-ERROR] 上传图片 {img_path_abs} 失败: {e}")
raise Exception(f"[Parser-ERROR] 上传图片 {img_path_abs} 失败: {e}")
# 打印匹配总结信息
print(f"[Parser-INFO] 共处理 {chunk_count} 个文本块。")
logger.info(f"[Parser-INFO] 共处理 {chunk_count} 个文本块。")
# 4. 更新文本块的图像信息
if image_info_list and chunk_ids_list:
@ -667,10 +672,10 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
direct_update = {"doc": {"img_id": relative_path}}
es_client.update(index=index_name, id=chunk_id, body=direct_update, refresh=True)
index_name = f"ragflow_{tenant_id}"
print(f"[Parser-INFO] 更新文本块 {chunk_id} 的图片关联: {relative_path}")
logger.info(f"[Parser-INFO] 更新文本块 {chunk_id} 的图片关联: {relative_path}")
except Exception as e:
print(f"[Parser-ERROR] 更新文本块图片关联失败: {e}")
logger.error(f"[Parser-ERROR] 更新文本块图片关联失败: {e}")
raise Exception(f"[Parser-ERROR] 更新文本块图片关联失败: {e}")
finally:
if cursor:
@ -685,16 +690,16 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
_create_task_record(doc_id, chunk_ids_list) # 创建task记录
update_progress(1.0, "解析完成")
print(f"[Parser-INFO] 解析完成文档ID: {doc_id}, 耗时: {process_duration:.2f}s, 块数: {chunk_count}")
logger.info(f"[Parser-INFO] 解析完成文档ID: {doc_id}, 耗时: {process_duration:.2f}s, 块数: {chunk_count}")
return {"success": True, "chunk_count": chunk_count}
except Exception as e:
process_duration = time.time() - start_time
# error_message = f"解析失败: {str(e)}"
print(f"[Parser-ERROR] 文档 {doc_id} 解析失败: {e}")
logger.error(f"[Parser-ERROR] 文档 {doc_id} 解析失败: {e}")
error_message = f"解析失败: {e}"
traceback.print_exc() # 打印详细错误堆栈
traceback.logger.info_exc() # 打印详细错误堆栈
# 更新文档状态为失败
_update_document_progress(doc_id, status="1", run="0", message=error_message, process_duration=process_duration) # status=1表示完成run=0表示失败
return {"success": False, "error": error_message}
@ -707,4 +712,4 @@ def perform_parse(doc_id, doc_info, file_info, embedding_config, kb_info):
if temp_image_dir and os.path.exists(temp_image_dir):
shutil.rmtree(temp_image_dir, ignore_errors=True)
except Exception as clean_e:
print(f"[Parser-WARNING] 清理临时文件失败: {clean_e}")
logger.error(f"[Parser-WARNING] 清理临时文件失败: {clean_e}")

View File

@ -1,10 +1,11 @@
import logging
import copy
import datrie
import logging
import math
import os
import re
import string
import datrie
from hanziconv import HanziConv
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

View File

@ -1,15 +1,16 @@
import mysql.connector
import json
import threading
import requests
import traceback
import time
import traceback
from datetime import datetime
from utils import generate_uuid
import mysql.connector
import requests
from database import DB_CONFIG
from utils import generate_uuid
# 解析相关模块
from .document_parser import perform_parse, _update_document_progress
from .document_parser import _update_document_progress, perform_parse
# 用于存储进行中的顺序批量任务状态
# 结构: { kb_id: {"status": "running/completed/failed", "total": N, "current": M, "message": "...", "start_time": timestamp} }