第一次

This commit is contained in:
Voge1imkafig 2025-07-02 16:17:54 +08:00
commit 87a6cc9a78
9 changed files with 15197 additions and 0 deletions

Binary file not shown.

Binary file not shown.

396
docx_to_json.py Normal file
View File

@ -0,0 +1,396 @@
import json
from docx import Document
from docx.oxml.shared import qn
def docx_to_json(docx_path):
    """Parse a .docx file into a JSON-serialisable list of body elements.

    The direct children of the document body are walked in document order.
    Each paragraph becomes ``{"type": "text", "content": ...}`` and each
    table becomes ``{"type": "table", "content": ...}``; every other body
    child is ignored.

    :param docx_path: path of the document to read.
    :return: list of element dicts in document order.
    """
    print(f"\n开始解析文档: {docx_path}")
    doc = Document(docx_path)
    # Resolve both collections once instead of re-evaluating the properties
    # for every body element.
    paragraphs = doc.paragraphs
    tables = doc.tables
    # Compare against the exact qualified tags: a suffix test such as
    # endswith('p') would also accept any other tag that happens to end in
    # "p" and desynchronise para_index from the paragraph list.
    paragraph_tag = qn('w:p')
    table_tag = qn('w:tbl')
    result = []
    para_index = 0
    table_index = 0
    print(f"文档包含 {len(paragraphs)} 个段落和 {len(tables)} 个表格")
    for element in doc.element.body:
        if element.tag == paragraph_tag:
            paragraph = paragraphs[para_index]
            print(f"\n处理段落 {para_index}: {paragraph.text[:50]}...")
            para_data = process_paragraph(paragraph)
            result.append({
                "type": "text",
                "content": para_data
            })
            para_index += 1
        elif element.tag == table_tag:
            table = tables[table_index]
            print(f"\n处理表格 {table_index} ({len(table.rows)}行×{len(table.columns)}列)")
            table_data = process_table_with_merge_info(table)
            # No table-level "bold" key: bold is recorded per run.
            result.append({
                "type": "table",
                "content": table_data
            })
            table_index += 1
    print("\n文档解析完成!")
    return result
def process_table_with_merge_info(table):
    """Serialise a table: cell contents, formatting, sizes and merge ranges.

    :param table: python-docx table object.
    :return: dict with keys ``rows``, ``cols``, ``cells`` (row-major matrix
        of cell dicts), ``merged_cells`` (one entry per merged region),
        ``row_heights`` and ``col_widths`` (inches, ``None`` when unset).
    """
    rows = list(table.rows)
    row_count = len(rows)
    col_count = len(table.columns)
    table_data = {
        "rows": row_count,
        "cols": col_count,
        "cells": [],
        "merged_cells": [],
        "row_heights": [None] * row_count,
        "col_widths": [None] * col_count
    }

    # Resolve every row's cells once; the merge scan below would otherwise
    # re-resolve table.rows[i].cells for each comparison.
    grid = [row.cells for row in rows]

    # Pass 1: per-cell content and formatting, plus row heights.
    cell_data_matrix = []
    for i, row in enumerate(rows):
        row_data = []
        for j, cell in enumerate(grid[i]):
            row_data.append({
                "row": i,
                "col": j,
                "content": process_cell_content(cell),
                "alignment": get_cell_alignment(cell),
                "vertical_align": get_vertical_alignment(cell),
                "border": get_cell_border(cell),
                "shading": get_cell_shading(cell),
                "margins": get_cell_margins(cell),
                "is_merged": False,
                "merge_info": None
            })
        cell_data_matrix.append(row_data)
        if grid[i] and row.height is not None:
            table_data["row_heights"][i] = row.height.inches

    # Column widths, limited to the columns the last row actually spans.
    if grid:
        columns = table.columns
        for j in range(min(len(grid[-1]), col_count)):
            width = columns[j].width
            if width is not None:
                table_data["col_widths"][j] = width.inches

    # Pass 2: detect merged regions. Grid positions belonging to one merged
    # cell share the same underlying ``_tc`` element.
    covered = set()  # (row, col) positions already assigned to a region
    for i, row_cells in enumerate(grid):
        for j, cell in enumerate(row_cells):
            if (i, j) in covered:
                continue
            r2 = i
            c2 = j
            # Extend to the right along the anchor row.
            while (c2 + 1 < col_count and c2 + 1 < len(row_cells)
                   and row_cells[c2 + 1]._tc is cell._tc):
                c2 += 1
            # Extend downwards along the anchor column.
            while (r2 + 1 < row_count and j < len(grid[r2 + 1])
                   and grid[r2 + 1][j]._tc is cell._tc):
                r2 += 1
            if r2 == i and c2 == j:
                continue  # ordinary, unmerged cell

            merge_range = f"{i},{j}-{r2},{c2}"
            primary = cell_data_matrix[i][j]
            primary["is_merged"] = True
            primary["merge_info"] = {
                "is_primary": True,
                "merge_range": merge_range
            }
            table_data["merged_cells"].append({
                "start_row": i,
                "start_col": j,
                "end_row": r2,
                "end_col": c2,
                # Same cell as the primary entry, so reuse its parsed content.
                "content": primary["content"]
            })
            for r in range(i, r2 + 1):
                for c in range(j, c2 + 1):
                    covered.add((r, c))
                    if r == i and c == j:
                        continue  # primary cell handled above
                    if c >= len(cell_data_matrix[r]):
                        continue  # ragged row: no entry to annotate
                    cell_data_matrix[r][c]["is_merged"] = True
                    cell_data_matrix[r][c]["merge_info"] = {
                        "is_primary": False,
                        "merge_range": merge_range
                    }

    table_data["cells"] = cell_data_matrix
    return table_data
def get_vertical_alignment(cell):
    """Return the cell's vertical alignment: 'top', 'center' or 'bottom'.

    Reads ``w:vAlign/@w:val`` from the cell properties. Any other value, a
    missing element, or a read failure yields the default 'top'.

    :param cell: table cell whose ``_element.tcPr`` is inspected.
    :return: one of 'top', 'center', 'bottom'.
    """
    try:
        tcPr = cell._element.tcPr
        if tcPr is not None:
            values = tcPr.xpath('w:vAlign/@w:val')
            if values:
                value = str(values[0])
                return value if value in ('top', 'center', 'bottom') else 'top'
    except Exception:
        # Best effort: report and fall back, but do not intercept
        # KeyboardInterrupt / SystemExit as a bare ``except:`` would.
        print("获取垂直对齐方式失败")
    return 'top'  # default: top-aligned
def get_cell_shading(cell):
    """Return the cell's background fill, or None when it has none.

    :param cell: table cell whose ``_element.tcPr`` is inspected.
    :return: ``{'color': <w:fill>, 'theme': <w:themeColor or ''>}`` when
        ``w:shd`` carries a non-empty ``w:fill``; otherwise None.
    """
    try:
        tcPr = cell._element.tcPr
        if tcPr is not None:
            fills = tcPr.xpath('w:shd/@w:fill')
            if fills and fills[0]:
                themes = tcPr.xpath('w:shd/@w:themeColor')
                return {
                    'color': str(fills[0]),
                    'theme': str(themes[0]) if themes else ''
                }
    except Exception as e:
        # Report instead of swallowing silently; still best effort.
        print(f"获取单元格背景色失败: {e}")
    return None
def get_cell_margins(cell):
    """Return the cell's explicit margins, or None when none are set.

    The previous lookup passed the path 'w:tcMar/w:<side>' to ``qn``, which
    only accepts a single ``prefix:name`` pair; the resulting error was
    swallowed, so margins were never extracted. The path is now evaluated
    with the namespace-aware ``xpath`` of the ``tcPr`` element.

    :param cell: table cell whose ``_element.tcPr`` is inspected.
    :return: dict mapping 'top'/'left'/'bottom'/'right' to
        ``{'w': <w:w or None>, 'type': <w:type or None>}`` for each side
        present under ``w:tcMar``; None when no side is present.
    """
    margins = {}
    try:
        tcPr = cell._element.tcPr
        if tcPr is not None:
            for side in ['top', 'left', 'bottom', 'right']:
                base = f'w:tcMar/w:{side}'
                if not tcPr.xpath(base):
                    continue
                width = tcPr.xpath(f'{base}/@w:w')
                kind = tcPr.xpath(f'{base}/@w:type')
                margins[side] = {
                    'w': str(width[0]) if width else None,
                    'type': str(kind[0]) if kind else None
                }
    except Exception as e:
        # Report instead of swallowing silently; still best effort.
        print(f"获取单元格边距失败: {e}")
    return margins if margins else None
def process_cell_content(cell):
    """Serialise every paragraph of a cell via ``process_paragraph``.

    :param cell: object exposing a ``paragraphs`` sequence.
    :return: list with one paragraph dict per paragraph, in order.
    """
    return [process_paragraph(entry) for entry in cell.paragraphs]
def has_page_break(run):
    """Return True when the run contains a page break.

    A page break is a ``w:br`` element whose own ``w:type`` attribute is
    "page", or a form-feed character in the run text. The earlier test
    searched the serialized XML for the two substrings 'w:br' and
    'type="page"' independently, so a plain line break next to any other
    attribute ending in type="page" was reported as a page break.

    :param run: run object exposing ``_element`` and ``text``.
    :return: bool.
    """
    page_breaks = run._element.xpath('.//w:br[@w:type="page"]')
    return bool(page_breaks) or '\x0c' in run.text
def process_paragraph(paragraph):
    """Serialise a paragraph into its alignment and a list of run dicts.

    :param paragraph: paragraph object exposing ``runs``.
    :return: ``{"alignment": str, "runs": [...]}`` where each run dict holds
        ``text``, ``font``, ``style`` (style name or None) and
        ``has_page_break``.
    """
    alignment = get_alignment_with_fallback(paragraph)
    print(f"段落对齐方式: {alignment}")
    runs = [
        {
            "text": run.text,
            "font": get_font_info(run, paragraph),
            "style": None if not run.style else run.style.name,
            "has_page_break": has_page_break(run),
        }
        for run in paragraph.runs
    ]
    print(f"段落包含 {len(runs)} 个文本运行(runs)")
    return {"alignment": alignment, "runs": runs}
def get_alignment_with_fallback(paragraph):
    """Return the paragraph alignment as 'left', 'center', 'right' or 'justify'.

    Resolution order:
      1. the paragraph's own ``alignment`` value;
      2. ``w:pPr/w:jc`` of the paragraph's style;
      3. the default 'left'.

    Previously step 2 lived inside the ``except`` branch of step 1, so it
    only ran when reading ``paragraph.alignment`` raised. In the ordinary
    case where the value is simply None, the style was never consulted.

    :param paragraph: paragraph object exposing ``alignment`` and ``style``.
    :return: alignment name.
    """
    direct_map = {
        0: "left",
        1: "center",
        2: "right",
        3: "justify"
    }
    style_map = {
        "left": "left",
        "center": "center",
        "right": "right",
        "both": "justify",
        "start": "left",
        "end": "right"
    }

    # 1. Direct paragraph formatting.
    try:
        direct = paragraph.alignment
    except Exception:
        direct = None  # unreadable direct formatting: fall through to the style
    if direct is not None:
        result = direct_map.get(direct, "left")
        print(f"从paragraph.alignment获取对齐方式: {result}")
        return result

    # 2. Alignment declared by the paragraph style.
    try:
        values = paragraph.style.element.xpath('w:pPr/w:jc/@w:val')
        if values:
            result = style_map.get(str(values[0]), "left")
            print(f"从段落样式w:jc获取对齐方式: {result}")
            return result
    except Exception as e:
        print(f"获取对齐方式失败: {str(e)}")

    # 3. Nothing declared anywhere.
    print("使用默认对齐方式(left)")
    return "left"
def get_font_info(run, paragraph):
    """Collect font attributes of a run, falling back to the paragraph style.

    Size and name come from ``run.font`` when set there. Otherwise the
    paragraph style's ``w:rPr`` is consulted (``w:sz`` in half-points;
    ``w:rFonts`` preferring ``w:eastAsia`` over ``w:ascii``). If that lookup
    fails, the defaults 11 pt and "Calibri" are used. When the style has a
    ``w:rPr`` but lacks the specific element, the value stays None.

    :param run: run object exposing ``font``.
    :param paragraph: paragraph that owns the run (style fallback source).
    :return: dict with ``name``, ``size``, ``bold``, ``italic``,
        ``underline`` and ``color``.
    """
    font = run.font
    font_info = {
        "name": None,
        "size": None,
        "bold": font.bold if font.bold is not None else False,  # unset counts as not bold
        "italic": font.italic,
        "underline": font.underline,
        "color": get_color_info(run, paragraph)
    }

    # Font size.
    if font.size:
        font_info["size"] = font.size.pt
    else:
        try:
            p_rpr = paragraph.style.element.xpath('w:rPr')[0]
            sizes = p_rpr.xpath('w:sz')
            if sizes:
                # w:sz is expressed in half-points.
                font_info["size"] = int(sizes[0].attrib[qn('w:val')]) / 2
                print(f"从段落样式获取字体大小: {font_info['size']}pt")
        except Exception as e:
            print(f"获取字体大小失败: {str(e)}")
            font_info["size"] = 11  # default

    # Font name.
    if font.name:
        font_info["name"] = font.name
        print(f"从run.font获取字体: {font.name}")
    else:
        try:
            p_rpr = paragraph.style.element.xpath('w:rPr')[0]
            r_fonts = p_rpr.xpath('w:rFonts')
            if r_fonts:
                attrs = r_fonts[0].attrib
                east_asia = attrs.get(qn("w:eastAsia"))
                if east_asia is not None:
                    font_info["name"] = east_asia
                    print(f"从段落样式w:eastAsia获取字体: {font_info['name']}")
                else:
                    # A missing w:ascii raises KeyError, handled below with
                    # the default name. This replaces the former bare
                    # ``except:`` around the eastAsia lookup.
                    font_info["name"] = attrs[qn("w:ascii")]
                    print(f"从段落样式w:ascii获取字体: {font_info['name']}")
        except Exception as e:
            print(f"获取字体失败: {str(e)}")
            font_info["name"] = "Calibri"  # default
    return font_info
def get_color_info(run, paragraph):
    """Return the run's text colour as ``{"r", "g", "b"}``, or None.

    The run's own colour is preferred; otherwise the ``w:color`` of the
    paragraph style's run properties is used.

    Fixes over the previous version:
      * ``font.color.rgb`` is unpacked as an (r, g, b) triple instead of
        being bit-shifted like an integer, which always raised and was
        swallowed, so run colours were never captured;
      * the style attribute is read by its namespaced name (the literal key
        'w:val' never matched);
      * an "FF" prefix is only stripped from 8-digit AARRGGBB values, so
        6-digit colours such as "FF0000" are no longer truncated;
      * non-hex values such as "auto" yield None.

    :param run: run object exposing ``font.color.rgb``.
    :param paragraph: paragraph that owns the run (style fallback source).
    :return: dict of ints keyed 'r', 'g', 'b', or None.
    """
    color_info = None

    # 1. Colour set directly on the run.
    try:
        rgb = run.font.color.rgb
        if rgb is not None:
            red, green, blue = rgb
            color_info = {"r": int(red), "g": int(green), "b": int(blue)}
            print(f"从run.font获取颜色: RGB({color_info['r']}, {color_info['g']}, {color_info['b']})")
    except Exception as e:
        print(f"从run.font获取颜色失败: {str(e)}")

    # 2. Colour declared by the paragraph style.
    if color_info is None:
        try:
            values = paragraph.style.element.xpath('w:rPr/w:color/@w:val')
            if values:
                hex_color = str(values[0])
                if len(hex_color) == 8:
                    hex_color = hex_color[2:]  # drop a leading alpha byte
                if len(hex_color) == 6:
                    rgb = tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
                    color_info = {
                        "r": rgb[0],
                        "g": rgb[1],
                        "b": rgb[2]
                    }
                    print(f"从段落样式获取颜色: RGB{rgb}")
        except Exception as e:
            print(f"从段落样式获取颜色失败: {str(e)}")
    return color_info
def get_cell_alignment(cell):
    """Return the alignment of the cell's first paragraph, or "left" if it has none."""
    paragraphs = cell.paragraphs
    return get_alignment_with_fallback(paragraphs[0]) if paragraphs else "left"
def get_cell_border(cell):
    """Return border settings for the four sides of a cell.

    Each side maps to ``{"style", "size", "color"}``. A side without an
    explicit ``w:tcBorders`` entry, a cell without ``tcPr``, or any read
    failure yields a solid black border of size 4.

    :param cell: table cell whose ``_element.tcPr`` is inspected.
    :return: dict keyed by 'top', 'bottom', 'left', 'right'.
    """
    fallback = {
        side: {"style": "single", "size": 4, "color": "000000"}
        for side in ("top", "bottom", "left", "right")
    }
    try:
        tcPr = cell._element.tcPr
        if tcPr is None:
            return fallback
        resolved = {}
        for side in ('top', 'bottom', 'left', 'right'):
            matches = tcPr.xpath(f'w:tcBorders/w:{side}')
            if not matches:
                resolved[side] = fallback[side]
                continue
            node = matches[0]
            style_value = node.get(qn('w:val'), 'single')
            size_value = node.get(qn('w:sz'), '4')
            color_value = node.get(qn('w:color'), '000000')
            resolved[side] = {
                "style": style_value,
                "size": int(size_value),
                "color": color_value
            }
        return resolved
    except Exception as e:
        print(f"获取单元格边框失败: {str(e)}, 使用默认边框")
        return fallback
def process_cell(cell):
    """Serialise every paragraph of a cell, logging the paragraph count first.

    :param cell: object exposing a ``paragraphs`` sequence.
    :return: list with one paragraph dict per paragraph, in order.
    """
    print(f"处理单元格,包含 {len(cell.paragraphs)} 个段落")
    return [process_paragraph(entry) for entry in cell.paragraphs]
if __name__ == "__main__":
    import sys

    # Optional CLI overrides: <input.docx> [<output.json>]. Without
    # arguments the original hard-coded locations are used.
    docx_path = sys.argv[1] if len(sys.argv) > 1 else r'D:\work\报告扫描\source.docx'
    output_json = sys.argv[2] if len(sys.argv) > 2 else "output.json"
    json_data = docx_to_json(docx_path)
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)
    print(f"转换完成结果已保存到{output_json}")

243
json_to_docx.py Normal file
View File

@ -0,0 +1,243 @@
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.shared import qn, OxmlElement
import json
def json_to_docx(json_data, output_path):
    """Rebuild a .docx document from the element list and save it.

    :param json_data: list of ``{"type": "text" | "table", "content": ...}``
        dicts, processed in order; other types are skipped.
    :param output_path: where the generated document is saved.
    """
    print(f"\n开始转换JSON到DOCX文档输出路径: {output_path}")
    doc = Document()
    total_elements = len(json_data)
    print(f"文档包含 {total_elements} 个元素(段落和表格)")
    for position, element in enumerate(json_data, start=1):
        print(f"\n处理元素 {position}/{total_elements}: ", end="")
        kind = element["type"]
        if kind == "text":
            content = element["content"]
            print(f"段落 (长度: {len(content['runs'])}个runs)")
            add_paragraph_from_json(doc, content)
        elif kind == "table":
            content = element["content"]
            rows = content["rows"]
            cols = content["cols"]
            merges = len(content.get("merged_cells", []))
            print(f"表格 ({rows}行×{cols}列, 包含 {merges} 个合并单元格)")
            add_table_from_json(doc, content, element.get("bold", False))
    print("\n正在保存文档...")
    doc.save(output_path)
    print(f"文档已成功保存到 {output_path}")
def add_paragraph_from_json(doc, para_json):
    """Append a paragraph described by ``para_json`` to ``doc``.

    :param doc: any container exposing ``add_paragraph()``; this file calls
        it with both the document and table cells.
    :param para_json: dict with ``alignment`` and ``runs``; each run carries
        ``text``, ``font`` (name/size/bold/italic/underline/color) and
        optionally ``has_page_break``.
    """
    # Imported here because the name is not among the file-level imports.
    # Importing once replaces the former ``import docx`` inside the run loop.
    from docx.enum.text import WD_BREAK

    paragraph = doc.add_paragraph()
    print(f" 添加段落 (对齐: {para_json['alignment']})")
    # Paragraph alignment; unknown names fall back to left.
    alignment_map = {
        "left": WD_ALIGN_PARAGRAPH.LEFT,
        "center": WD_ALIGN_PARAGRAPH.CENTER,
        "right": WD_ALIGN_PARAGRAPH.RIGHT,
        "justify": WD_ALIGN_PARAGRAPH.JUSTIFY
    }
    paragraph.alignment = alignment_map.get(para_json["alignment"], WD_ALIGN_PARAGRAPH.LEFT)
    for run_idx, run_json in enumerate(para_json["runs"], 1):
        run = paragraph.add_run(run_json["text"])
        # .get(): tolerate JSON without the page-break flag.
        if run_json.get("has_page_break"):
            run.add_break(WD_BREAK.PAGE)
        font = run.font
        font_json = run_json["font"]
        print(f" 添加run {run_idx}: '{run_json['text']}' "
              f"(字体: {font_json['name']}, 大小: {font_json['size']}, "
              f"加粗: {font_json['bold']}, 斜体: {font_json['italic']})")
        if font_json["name"]:
            font.name = font_json["name"]
            # Also set the East Asian font slot on the run's rFonts element.
            run.element.rPr.rFonts.set(qn('w:eastAsia'), font_json["name"])
        if font_json["size"]:
            font.size = Pt(font_json["size"])
        font.bold = font_json["bold"]
        font.italic = font_json["italic"]
        font.underline = font_json["underline"]
        color = font_json["color"]
        if color:
            font.color.rgb = RGBColor(color["r"], color["g"], color["b"])
            print(f" 设置颜色: RGB({color['r']}, {color['g']}, {color['b']})")
def _apply_column_widths(table, col_widths):
    """Write each non-None width (inches) as a fixed ``w:tcW`` on every cell of its column."""
    columns = table.columns
    for col_idx, width in enumerate(col_widths):
        if width is None or col_idx >= len(columns):
            continue
        twips_width = int(width * 1440)  # 1 inch = 1440 twips
        for cell in columns[col_idx].cells:
            tcPr = cell._tc.get_or_add_tcPr()
            tcW = tcPr.first_child_found_in("w:tcW")
            if tcW is None:
                tcW = OxmlElement('w:tcW')
                tcPr.append(tcW)
            tcW.set(qn('w:w'), str(twips_width))
            tcW.set(qn('w:type'), 'dxa')  # absolute units


def _apply_row_heights(table, row_heights):
    """Write each non-None height (inches) as an 'atLeast' ``w:trHeight`` on its row."""
    rows = table.rows
    for row_idx, height in enumerate(row_heights):
        if height is None or row_idx >= len(rows):
            continue
        twips_height = int(height * 1440)  # 1 inch = 1440 twips
        trPr = rows[row_idx]._tr.get_or_add_trPr()
        # Reuse an existing element so a row never carries two heights.
        trHeight = trPr.find(qn('w:trHeight'))
        if trHeight is None:
            trHeight = OxmlElement('w:trHeight')
            trPr.append(trHeight)
        trHeight.set(qn('w:val'), str(twips_height))
        trHeight.set(qn('w:hRule'), 'atLeast')  # 'exact' would fix the height


def add_table_from_json(doc, table_json, bold=False):
    """Append a table described by ``table_json`` to ``doc``.

    Steps: create the grid, apply column widths and row heights, merge the
    recorded regions, then fill and format every cell that is not a
    non-primary member of a merged region.

    :param doc: container exposing ``add_table(rows=, cols=)``.
    :param table_json: dict with ``rows``, ``cols``, ``cells`` and
        optionally ``merged_cells``, ``row_heights``, ``col_widths``.
    :param bold: accepted for backward compatibility; not used.
    """
    print(f" 创建表格: {table_json['rows']}× {table_json['cols']}")
    table = doc.add_table(rows=table_json["rows"], cols=table_json["cols"])
    # Start from the built-in grid style; per-cell borders are applied when
    # each cell is formatted.
    table.style = 'Table Grid'

    if "col_widths" in table_json and any(table_json["col_widths"]):
        print(" 设置列宽...")
        _apply_column_widths(table, table_json["col_widths"])

    if "row_heights" in table_json and any(table_json["row_heights"]):
        print(" 设置行高...")
        _apply_row_heights(table, table_json["row_heights"])

    # Merge before filling so content lands in the final merged cell.
    for merge_idx, merge_info in enumerate(table_json.get("merged_cells", []), 1):
        start_row = merge_info["start_row"]
        start_col = merge_info["start_col"]
        end_row = merge_info["end_row"]
        end_col = merge_info["end_col"]
        print(f" 合并单元格 #{merge_idx}: 从({start_row},{start_col})到({end_row},{end_col})")
        table.cell(start_row, start_col).merge(table.cell(end_row, end_col))

    for row_idx, row_data in enumerate(table_json["cells"]):
        for col_idx, cell_data in enumerate(row_data):
            # Non-primary members of a merged region have no cell of their own.
            if cell_data["is_merged"] and not cell_data["merge_info"]["is_primary"]:
                print(f" 跳过被合并的单元格({row_idx},{col_idx})")
                continue
            cell = table.cell(cell_data["row"], cell_data["col"])
            print(f" 处理单元格({row_idx},{col_idx}) - 对齐: {cell_data['alignment']}")
            format_cell(cell, cell_data)
def format_cell(cell, cell_data):
    """Replace a cell's content and apply its recorded formatting.

    :param cell: target table cell.
    :param cell_data: dict with ``content`` (list of paragraph dicts),
        ``alignment``, ``border`` and optionally ``vertical_align``,
        ``shading``, ``margins``.
    """
    # Drop the placeholder paragraph(s) created with the table.
    for p in cell.paragraphs:
        p._element.getparent().remove(p._element)
    paragraphs = cell_data["content"]
    for para in paragraphs:
        add_paragraph_from_json(cell, para)
    if not paragraphs:
        # A table cell must contain at least one paragraph to be valid.
        cell.add_paragraph()
    # Apply properties in the order the tcPr schema lists them
    # (tcBorders, shd, tcMar, vAlign), so setters that append to tcPr still
    # produce a correctly ordered element.
    set_cell_border(cell, cell_data["border"])
    if cell_data.get("shading"):
        set_cell_shading(cell, cell_data["shading"])
    if cell_data.get("margins"):
        set_cell_margins(cell, cell_data["margins"])
    set_cell_alignment(cell, cell_data)
def set_cell_alignment(cell, cell_data):
    """Apply horizontal (first paragraph) and vertical alignment to a cell.

    :param cell: target table cell.
    :param cell_data: dict with ``alignment`` and optionally
        ``vertical_align`` ('top', 'center' or 'bottom'; anything else is
        treated as 'top').
    """
    # Horizontal alignment lives on the paragraph, not on the cell.
    paragraphs = cell.paragraphs
    if paragraphs:
        align_map = {
            "left": WD_ALIGN_PARAGRAPH.LEFT,
            "center": WD_ALIGN_PARAGRAPH.CENTER,
            "right": WD_ALIGN_PARAGRAPH.RIGHT,
            "justify": WD_ALIGN_PARAGRAPH.JUSTIFY
        }
        paragraphs[0].alignment = align_map.get(cell_data["alignment"], WD_ALIGN_PARAGRAPH.LEFT)

    # Vertical alignment.
    tcPr = cell._tc.get_or_add_tcPr()
    align_value = cell_data.get('vertical_align', 'top')
    print(f" 设置垂直对齐: {align_value}")
    if align_value not in ('top', 'center', 'bottom'):
        align_value = 'top'  # default
    # Reuse an existing w:vAlign instead of appending a duplicate; a new one
    # is inserted before the elements the schema orders after it.
    vAlign = tcPr.find(qn('w:vAlign'))
    if vAlign is None:
        vAlign = OxmlElement('w:vAlign')
        tcPr.insert_element_before(
            vAlign,
            'w:hideMark', 'w:headers', 'w:cellIns', 'w:cellDel',
            'w:cellMerge', 'w:tcPrChange',
        )
    vAlign.set(qn('w:val'), align_value)
def set_cell_shading(cell, shading):
    """Apply a background fill to a cell.

    :param cell: target table cell.
    :param shading: dict with ``color`` (value for ``w:fill``) and
        optionally ``theme`` (value for ``w:themeColor``).
    """
    tcPr = cell._tc.get_or_add_tcPr()
    # Replace any previous shading rather than stacking a second w:shd.
    for stale in tcPr.findall(qn('w:shd')):
        tcPr.remove(stale)
    shd = OxmlElement('w:shd')
    # w:val is a required attribute of w:shd; "clear" means no pattern,
    # just the fill colour.
    shd.set(qn('w:val'), 'clear')
    shd.set(qn('w:fill'), shading["color"])
    if shading.get("theme"):
        shd.set(qn('w:themeColor'), shading["theme"])
    # Insert before the elements the schema orders after w:shd.
    tcPr.insert_element_before(
        shd,
        'w:noWrap', 'w:tcMar', 'w:textDirection', 'w:tcFitText', 'w:vAlign',
        'w:hideMark', 'w:headers', 'w:cellIns', 'w:cellDel', 'w:cellMerge',
        'w:tcPrChange',
    )
def set_cell_margins(cell, margins):
    """Apply per-side cell margins.

    :param cell: target table cell.
    :param margins: dict mapping a side name ('top', 'left'/'start',
        'bottom', 'right'/'end') to ``{'w': ..., 'type': ...}``. Attributes
        whose value is None are omitted; other side names are ignored.
    """
    tcPr = cell._tc.get_or_add_tcPr()
    # Replace any previous margins rather than stacking a second w:tcMar.
    for stale in tcPr.findall(qn('w:tcMar')):
        tcPr.remove(stale)
    tcMar = OxmlElement('w:tcMar')
    # Emit sides in schema order regardless of the dict's own ordering.
    for side in ('top', 'start', 'left', 'bottom', 'end', 'right'):
        margin = margins.get(side)
        if not margin:
            continue
        side_el = OxmlElement(f'w:{side}')
        for attr in ('w', 'type'):
            value = margin.get(attr)
            if value is not None:  # set() rejects None
                side_el.set(qn(f'w:{attr}'), str(value))
        tcMar.append(side_el)
    # Insert before the elements the schema orders after w:tcMar.
    tcPr.insert_element_before(
        tcMar,
        'w:textDirection', 'w:tcFitText', 'w:vAlign', 'w:hideMark',
        'w:headers', 'w:cellIns', 'w:cellDel', 'w:cellMerge', 'w:tcPrChange',
    )
def set_cell_border(cell, border_data):
    """Apply border settings to a cell.

    :param cell: target table cell.
    :param border_data: dict mapping 'top'/'left'/'bottom'/'right' to
        ``{'style', 'size', 'color'}``; missing keys default to a solid
        black border of size 4. Sides absent from the dict are untouched.
    """
    tcPr = cell._tc.get_or_add_tcPr()
    tcBorders = tcPr.first_child_found_in("w:tcBorders")
    if tcBorders is None:
        tcBorders = OxmlElement('w:tcBorders')
        # Insert before the elements the schema orders after w:tcBorders.
        tcPr.insert_element_before(
            tcBorders,
            'w:shd', 'w:noWrap', 'w:tcMar', 'w:textDirection', 'w:tcFitText',
            'w:vAlign', 'w:hideMark', 'w:headers', 'w:cellIns', 'w:cellDel',
            'w:cellMerge', 'w:tcPrChange',
        )
    for side in ['top', 'left', 'bottom', 'right']:
        if side not in border_data:
            continue
        border = border_data[side]
        # Update an existing side element in place so repeated calls do not
        # leave duplicate entries for the same side.
        border_el = tcBorders.find(qn(f'w:{side}'))
        if border_el is None:
            border_el = OxmlElement(f'w:{side}')
            tcBorders.append(border_el)
        border_el.set(qn('w:val'), border.get('style', 'single'))
        border_el.set(qn('w:sz'), str(border.get('size', 4)))
        border_el.set(qn('w:color'), border.get('color', '000000'))
# Usage example
if __name__ == "__main__":
    import sys

    # Optional CLI overrides: <input.json> [<output.docx>]. Without
    # arguments the original file names are used.
    input_json = sys.argv[1] if len(sys.argv) > 1 else "output.json"
    output_path = sys.argv[2] if len(sys.argv) > 2 else "restored.docx"
    print(f"{input_json} 读取JSON数据...")
    with open(input_json, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    # Convert the JSON back into a DOCX document.
    json_to_docx(json_data, output_path)

14517
output.json Normal file

File diff suppressed because it is too large Load Diff

BIN
restored.docx Normal file

Binary file not shown.

BIN
source.docx Normal file

Binary file not shown.

41
查找分页符.py Normal file
View File

@ -0,0 +1,41 @@
from docx import Document
from docx.enum.text import WD_BREAK
import docx.oxml.shared as oxml
def find_and_mark_page_breaks(input_path, output_path):
    """Detect page breaks in a document, mark them, and save a copy.

    For every run reported by ``has_page_break``: the paragraph text is
    logged, form-feed characters in the run text are replaced with
    "[PAGE BREAK]", and a page break is added to the run.

    :param input_path: path of the document to read.
    :param output_path: path the modified document is saved to.
    """
    doc = Document(input_path)
    for paragraph in doc.paragraphs:
        for run in paragraph.runs:
            if not has_page_break(run):
                continue
            print(f"发现分页符 - 段落内容: '{paragraph.text}'")
            # Visible marker in place of any form-feed character.
            marked_text = run.text.replace("\x0c", "[PAGE BREAK]")
            run.text = marked_text
            # Add a page break to the run after rewriting its text.
            run.add_break(WD_BREAK.PAGE)
    doc.save(output_path)
    print(f"处理完成,结果已保存到: {output_path}")
def has_page_break(run):
    """Return True when the run contains a page break.

    A page break is a ``w:br`` element whose own ``w:type`` attribute is
    "page", or a form-feed character in the run text. The earlier test
    searched the serialized XML for the two substrings 'w:br' and
    'type="page"' independently, so a plain line break next to any other
    attribute ending in type="page" was reported as a page break.

    :param run: run object exposing ``_element`` and ``text``.
    :return: bool.
    """
    page_breaks = run._element.xpath('.//w:br[@w:type="page"]')
    return bool(page_breaks) or '\x0c' in run.text
# Usage example.
# NOTE(review): these statements are at module level without an
# `if __name__ == "__main__":` guard, so they also execute when this module
# is imported. Confirm whether that is intended.
input_file = "source.docx"
output_file = "output_with_marks.docx"
find_and_mark_page_breaks(input_file, output_file)