RAGflow/management/server/magic_pdf_parser.py

141 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
from io import BytesIO
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
def process_pdf_with_magic(file_content, callback=None):
"""
使用magic_pdf处理PDF文件
Args:
file_content: PDF文件内容
callback: 回调函数,用于更新进度
Returns:
解析后的内容列表
"""
try:
from magic_pdf.processor import PDFProcessor
from magic_pdf.extractor import TextExtractor, ImageExtractor
if callback:
callback(0.1, "初始化Magic PDF解析器")
# 创建临时文件
temp_dir = os.path.join(os.getcwd(), "temp")
os.makedirs(temp_dir, exist_ok=True)
temp_pdf_path = os.path.join(temp_dir, "temp.pdf")
with open(temp_pdf_path, "wb") as f:
f.write(file_content)
if callback:
callback(0.2, "开始解析PDF")
# 初始化处理器
processor = PDFProcessor(temp_pdf_path)
if callback:
callback(0.3, "提取文本内容")
# 提取文本
text_extractor = TextExtractor(processor)
text_content = text_extractor.extract()
if callback:
callback(0.5, "提取图片内容")
# 提取图片
image_extractor = ImageExtractor(processor)
images = image_extractor.extract()
if callback:
callback(0.7, "组织解析结果")
# 组织结果
content_list = []
# 添加文本内容
for page_num, page_text in enumerate(text_content):
content_list.append({
"type": "text",
"page": page_num + 1,
"text": page_text
})
# 添加图片内容
for i, img in enumerate(images):
content_list.append({
"type": "image",
"page": img.get("page", i + 1),
"image_path": img.get("path", ""),
"caption": img.get("caption", "")
})
# 清理临时文件
try:
os.remove(temp_pdf_path)
except:
pass
if callback:
callback(1.0, "PDF解析完成")
return content_list
except ImportError:
# 如果magic_pdf未安装使用简单的文本提取
if callback:
callback(0.2, "Magic PDF未安装使用备用方法")
try:
import PyPDF2
if callback:
callback(0.3, "使用PyPDF2提取文本")
pdf_reader = PyPDF2.PdfReader(BytesIO(file_content))
content_list = []
for i, page in enumerate(pdf_reader.pages):
if callback and i % 5 == 0:
progress = 0.3 + (i / len(pdf_reader.pages)) * 0.6
callback(progress, f"正在处理第 {i+1}/{len(pdf_reader.pages)}")
text = page.extract_text()
if text:
content_list.append({
"type": "text",
"page": i + 1,
"text": text
})
if callback:
callback(0.9, "文本提取完成")
return content_list
except Exception as e:
if callback:
callback(0.5, f"PDF解析失败: {str(e)}")
# 最简单的备用方案
return [{
"type": "text",
"page": 1,
"text": "无法解析PDF文件内容"
}]
except Exception as e:
if callback:
callback(0.5, f"PDF解析失败: {str(e)}")
# 出错时返回空列表
return [{
"type": "text",
"page": 1,
"text": f"解析失败: {str(e)}"
}]