RAGflow/management/server/magic_pdf_parser.py

141 lines
4.2 KiB
Python
Raw Normal View History

import os
from io import BytesIO
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
def process_pdf_with_magic(file_content, callback=None):
"""
使用magic_pdf处理PDF文件
Args:
file_content: PDF文件内容
callback: 回调函数用于更新进度
Returns:
解析后的内容列表
"""
try:
from magic_pdf.processor import PDFProcessor
from magic_pdf.extractor import TextExtractor, ImageExtractor
if callback:
callback(0.1, "初始化Magic PDF解析器")
# 创建临时文件
temp_dir = os.path.join(os.getcwd(), "temp")
os.makedirs(temp_dir, exist_ok=True)
temp_pdf_path = os.path.join(temp_dir, "temp.pdf")
with open(temp_pdf_path, "wb") as f:
f.write(file_content)
if callback:
callback(0.2, "开始解析PDF")
# 初始化处理器
processor = PDFProcessor(temp_pdf_path)
if callback:
callback(0.3, "提取文本内容")
# 提取文本
text_extractor = TextExtractor(processor)
text_content = text_extractor.extract()
if callback:
callback(0.5, "提取图片内容")
# 提取图片
image_extractor = ImageExtractor(processor)
images = image_extractor.extract()
if callback:
callback(0.7, "组织解析结果")
# 组织结果
content_list = []
# 添加文本内容
for page_num, page_text in enumerate(text_content):
content_list.append({
"type": "text",
"page": page_num + 1,
"text": page_text
})
# 添加图片内容
for i, img in enumerate(images):
content_list.append({
"type": "image",
"page": img.get("page", i + 1),
"image_path": img.get("path", ""),
"caption": img.get("caption", "")
})
# 清理临时文件
try:
os.remove(temp_pdf_path)
except:
pass
if callback:
callback(1.0, "PDF解析完成")
return content_list
except ImportError:
# 如果magic_pdf未安装使用简单的文本提取
if callback:
callback(0.2, "Magic PDF未安装使用备用方法")
try:
import PyPDF2
if callback:
callback(0.3, "使用PyPDF2提取文本")
pdf_reader = PyPDF2.PdfReader(BytesIO(file_content))
content_list = []
for i, page in enumerate(pdf_reader.pages):
if callback and i % 5 == 0:
progress = 0.3 + (i / len(pdf_reader.pages)) * 0.6
callback(progress, f"正在处理第 {i+1}/{len(pdf_reader.pages)}")
text = page.extract_text()
if text:
content_list.append({
"type": "text",
"page": i + 1,
"text": text
})
if callback:
callback(0.9, "文本提取完成")
return content_list
except Exception as e:
if callback:
callback(0.5, f"PDF解析失败: {str(e)}")
# 最简单的备用方案
return [{
"type": "text",
"page": 1,
"text": "无法解析PDF文件内容"
}]
except Exception as e:
if callback:
callback(0.5, f"PDF解析失败: {str(e)}")
# 出错时返回空列表
return [{
"type": "text",
"page": 1,
"text": f"解析失败: {str(e)}"
}]