# Docx2Json_Json2Docx/查找分页符.py

from docx import Document
from docx.enum.text import WD_BREAK
import docx.oxml.shared as oxml

def find_and_mark_page_breaks(input_path, output_path):
    """Mark every detected page break in a document with a visible marker.

    Loads the document at ``input_path``, walks the runs of each paragraph
    exposed by ``doc.paragraphs``, inserts the text ``[PAGE BREAK]`` right
    before each page break found, and saves the result to ``output_path``.
    The original page breaks are left in place.

    Args:
        input_path: Path of the document to read.
        output_path: Path the modified document is written to.
    """
    doc = Document(input_path)

    for paragraph in doc.paragraphs:
        # Snapshot the text up front so the log shows the paragraph as it was
        # before any marker was inserted into it.
        paragraph_text = paragraph.text
        for run in paragraph.runs:
            if not has_page_break(run):
                continue
            print(f"发现分页符 - 段落内容: '{paragraph_text}'")
            _mark_page_breaks(run)

    doc.save(output_path)
    print(f"处理完成，结果已保存到: {output_path}")


def _mark_page_breaks(run, marker="[PAGE BREAK]"):
    """Insert *marker* as text immediately before each page break in *run*."""
    page_breaks = run._element.xpath('.//w:br[@w:type="page"]')
    for page_break in page_breaks:
        # Add a sibling <w:t> element instead of assigning ``run.text``:
        # assigning ``run.text`` replaces the run's existing content, which
        # would discard the page break we are trying to mark.
        marker_element = oxml.OxmlElement("w:t")
        marker_element.text = marker
        page_break.addprevious(marker_element)

    if not page_breaks and "\x0c" in run.text:
        # No <w:br> page break element; the break is a literal form feed in
        # the run text. Swap it for the marker and re-add a real page break.
        run.text = run.text.replace("\x0c", marker)
        run.add_break(WD_BREAK.PAGE)

def has_page_break(run):
    """Return True if *run* contains a page break.

    A run counts as containing a page break when its serialized XML has a
    ``<w:br>`` tag carrying ``w:type="page"``, or when its text contains a
    form-feed character (U+000C).

    Args:
        run: Object exposing ``_element.xml`` (the run's XML as a string)
            and ``text``.

    Returns:
        bool: True when a page break is detected, False otherwise.
    """
    import re

    xml = run._element.xml
    # The type attribute must belong to the <w:br> tag itself. Testing for
    # 'w:br' and 'type="page"' independently would also match a plain line
    # break that merely shares the run with text such as type="page".
    if re.search(r'<w:br\b[^>]*\bw:type="page"', xml):
        return True
    return "\x0c" in run.text

# Usage example
input_file = "source.docx"
output_file = "output_with_marks.docx"

# Run the example only when executed as a script, so that importing this
# module does not read and write documents as a side effect.
if __name__ == "__main__":
    find_and_mark_page_breaks(input_file, output_file)