41 lines
1.3 KiB
Python
41 lines
1.3 KiB
Python
from docx import Document
|
|
from docx.enum.text import WD_BREAK
|
|
import docx.oxml.shared as oxml
|
|
|
|
def find_and_mark_page_breaks(input_path, output_path):
    """Mark every explicit page break in a document with a visible tag.

    Each run of each paragraph returned by ``doc.paragraphs`` is checked
    with ``has_page_break``. For every ``<w:br w:type="page"/>`` element
    found in such a run, a text element containing ``[PAGE BREAK]`` is
    inserted directly in front of it. The page break itself and all other
    run content are left in place.

    Progress messages are printed to stdout, and the modified document is
    written to ``output_path``.

    Args:
        input_path: Path of the document to read.
        output_path: Path the marked-up document is saved to.

    Returns:
        None.
    """
    # Local imports keep this function self-contained; both names belong to
    # the python-docx package this file already depends on.
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn

    br_tag = qn("w:br")
    type_attr = qn("w:type")

    doc = Document(input_path)

    for paragraph in doc.paragraphs:
        for run in paragraph.runs:
            if not has_page_break(run):
                continue

            print(f"发现分页符 - 段落内容: '{paragraph.text}'")

            # Work on the XML directly. Assigning to ``run.text`` would
            # rebuild the run's content and thereby drop the break element
            # (and any other non-text children) instead of marking it.
            # The matches are collected first because the tree is modified
            # inside the loop.
            page_breaks = [
                br for br in run._element.iter(br_tag)
                if br.get(type_attr) == "page"
            ]
            for br in page_breaks:
                marker = OxmlElement("w:t")
                marker.text = "[PAGE BREAK]"
                # Place the marker immediately before the break so it shows
                # up at the end of the page that the break terminates.
                br.addprevious(marker)

    # Save the modified document.
    doc.save(output_path)
    print(f"处理完成,结果已保存到: {output_path}")
|
|
|
|
def has_page_break(run):
    """Return True if *run* contains an explicit page break.

    The run's XML element (``run._element``) and its descendants are
    searched for ``<w:br>`` elements whose ``w:type`` attribute equals
    ``"page"``. Inspecting elements instead of the serialized XML string
    avoids false positives when the run's text merely contains the
    characters ``w:br`` or ``type="page"``.

    As a fallback, a form-feed character (``"\\x0c"``) in ``run.text`` is
    also treated as a page break.

    Args:
        run: An object exposing ``_element`` (an element tree node) and
            ``text`` (a string), such as a python-docx run.

    Returns:
        bool: True when a page break is present, False otherwise.
    """
    # WordprocessingML main namespace in Clark notation, so no helper from
    # the docx package is needed to build the qualified names.
    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    br_tag = w_ns + "br"
    type_attr = w_ns + "type"

    # A <w:br> without w:type (or with another type) is a line or column
    # break, not a page break.
    if any(br.get(type_attr) == "page" for br in run._element.iter(br_tag)):
        return True

    return "\x0c" in run.text
|
|
|
|
# Example usage: default input and output document paths.
input_file = "source.docx"
output_file = "output_with_marks.docx"

if __name__ == "__main__":
    # Run the conversion only when this file is executed as a script, so
    # importing the module does not read or write any document.
    find_and_mark_page_breaks(input_file, output_file)