141 lines
4.2 KiB
Python
141 lines
4.2 KiB
Python
import os
|
||
from io import BytesIO
|
||
|
||
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
|
||
from magic_pdf.data.dataset import PymuDocDataset
|
||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||
from magic_pdf.config.enums import SupportedPdfParseMethod
|
||
|
||
def process_pdf_with_magic(file_content, callback=None):
    """Parse a PDF with magic_pdf, falling back to PyPDF2 if it is unavailable.

    The function never raises for parsing problems: every failure path is
    converted into a single-entry result list describing the failure.

    Args:
        file_content: Raw bytes of the PDF file.
        callback: Optional callable ``callback(progress, message)`` used to
            report progress; ``progress`` is a float between 0 and 1.

    Returns:
        A list of dicts. Text entries have the keys ``"type"`` (``"text"``),
        ``"page"`` (1-based) and ``"text"``. Image entries (magic_pdf path
        only) have ``"type"`` (``"image"``), ``"page"``, ``"image_path"`` and
        ``"caption"``. On failure the list holds one text entry whose
        ``"text"`` is an error message.
    """
    # Function-scope import, matching the lazy-import style used below.
    import tempfile

    def report(progress, message):
        # Progress reporting is optional; do nothing without a callback.
        if callback:
            callback(progress, message)

    try:
        from magic_pdf.processor import PDFProcessor
        from magic_pdf.extractor import TextExtractor, ImageExtractor

        report(0.1, "初始化Magic PDF解析器")

        # The processor is constructed from a file path, so the bytes are
        # spilled to disk. A unique file name keeps overlapping calls from
        # overwriting each other's input.
        temp_dir = os.path.join(os.getcwd(), "temp")
        os.makedirs(temp_dir, exist_ok=True)
        fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf", dir=temp_dir)

        try:
            with os.fdopen(fd, "wb") as f:
                f.write(file_content)

            report(0.2, "开始解析PDF")
            processor = PDFProcessor(temp_pdf_path)

            report(0.3, "提取文本内容")
            text_content = TextExtractor(processor).extract()

            report(0.5, "提取图片内容")
            images = ImageExtractor(processor).extract()

            report(0.7, "组织解析结果")

            # NOTE(review): assumes extract() yields one text item per page,
            # in page order -- confirm against the extractor implementation.
            content_list = [
                {"type": "text", "page": page_num + 1, "text": page_text}
                for page_num, page_text in enumerate(text_content)
            ]

            # Images are expected to be dict-like; the page falls back to the
            # image's 1-based position when no "page" key is present.
            content_list.extend(
                {
                    "type": "image",
                    "page": img.get("page", i + 1),
                    "image_path": img.get("path", ""),
                    "caption": img.get("caption", ""),
                }
                for i, img in enumerate(images)
            )
        finally:
            # Always remove the temporary file, including when parsing
            # fails. Cleanup is best-effort: a failed removal must not mask
            # the parse result or the original exception.
            try:
                os.remove(temp_pdf_path)
            except OSError:
                pass

        report(1.0, "PDF解析完成")
        return content_list

    except ImportError:
        # magic_pdf is not installed: fall back to plain text extraction.
        report(0.2, "Magic PDF未安装,使用备用方法")

        try:
            import PyPDF2

            report(0.3, "使用PyPDF2提取文本")

            pdf_reader = PyPDF2.PdfReader(BytesIO(file_content))
            total_pages = len(pdf_reader.pages)
            content_list = []

            for i, page in enumerate(pdf_reader.pages):
                # Report every fifth page, scaled into the 0.3-0.9 range.
                if i % 5 == 0:
                    progress = 0.3 + (i / total_pages) * 0.6
                    report(progress, f"正在处理第 {i+1}/{total_pages} 页")

                text = page.extract_text()
                # Pages without extractable text are skipped.
                if text:
                    content_list.append({
                        "type": "text",
                        "page": i + 1,
                        "text": text,
                    })

            report(0.9, "文本提取完成")
            return content_list

        except Exception as e:
            report(0.5, f"PDF解析失败: {str(e)}")

            # Last-resort fallback: a single placeholder entry.
            return [{
                "type": "text",
                "page": 1,
                "text": "无法解析PDF文件内容",
            }]

    except Exception as e:
        report(0.5, f"PDF解析失败: {str(e)}")

        # On error, return a single text entry carrying the failure message.
        return [{
            "type": "text",
            "page": 1,
            "text": f"解析失败: {str(e)}",
        }]