Docx2Json_Json2Docx/查找分页符.py

41 lines
1.3 KiB
Python
Raw Permalink Normal View History

2025-07-02 16:17:54 +08:00
from docx import Document
from docx.enum.text import WD_BREAK
import docx.oxml.shared as oxml
def find_and_mark_page_breaks(input_path, output_path):
    """Detect explicit page breaks in a document and mark them in place.

    Opens the document at ``input_path``, walks every run of every paragraph
    in ``doc.paragraphs``, and for each run that contains a page break
    inserts the visible text ``[PAGE BREAK]`` immediately before the break
    element. The original break is left untouched, so pagination of the
    output matches the input. The result is written to ``output_path``.

    Args:
        input_path: Path of the .docx file to read.
        output_path: Path the marked copy is saved to.

    Returns:
        None. Progress is reported with ``print``.
    """
    doc = Document(input_path)

    for paragraph in doc.paragraphs:
        for run in paragraph.runs:
            if not has_page_break(run):
                continue

            print(f"发现分页符 - 段落内容: '{paragraph.text}'")

            # Insert the marker at XML level, right before each break.
            # Re-assigning ``run.text`` is deliberately avoided: python-docx
            # rebuilds the run's children from the string, which loses the
            # original break (and any non-text content in the run), and the
            # text it reports never contains "\x0c" to replace in the first
            # place. Because the break is preserved, no extra break needs to
            # be appended afterwards.
            for page_break in run._element.xpath('.//w:br[@w:type="page"]'):
                marker = oxml.OxmlElement("w:t")
                marker.text = "[PAGE BREAK]"
                page_break.addprevious(marker)

    # Save the modified document.
    doc.save(output_path)
    print(f"处理完成,结果已保存到: {output_path}")
def has_page_break(run):
    """Return True if ``run`` contains an explicit page break.

    A page break is a ``w:br`` element whose ``w:type`` attribute is
    ``"page"``, located anywhere inside the run's XML element.

    Args:
        run: A python-docx run; its ``_element`` and ``text`` are read.

    Returns:
        bool: True when a page-break element is present, or when the run's
        text contains a form-feed character.
    """
    # Query the element tree rather than substring-matching serialized XML.
    # Two independent substring tests could pair an ordinary line break
    # (``<w:br/>``) with an unrelated ``type="page"`` attribute elsewhere in
    # the run, and they depend on the namespace prefix used in the text.
    if run._element.xpath('.//w:br[@w:type="page"]'):
        return True

    # Fallback kept from the original check: a literal form feed in the text.
    return "\x0c" in run.text
# Example usage. The paths stay defined at module level; the conversion itself
# only runs when this file is executed as a script, so importing the module
# no longer reads and writes documents as a side effect.
input_file = "source.docx"
output_file = "output_with_marks.docx"

if __name__ == "__main__":
    find_and_mark_page_breaks(input_file, output_file)