# Report_Generator/json2docx_docx2json/docx_to_json.py
#
# 397 lines
# 14 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
from docx import Document
from docx.oxml.shared import qn
def docx_to_json(docx_path):
    """Convert a .docx document into an ordered list of block records.

    The direct children of the document body are walked in order. Each
    paragraph becomes ``{"type": "text", "content": ...}`` and each table
    becomes ``{"type": "table", "content": ...}``; any other body child is
    skipped.

    Args:
        docx_path: Location of the document, handed to ``Document()``.

    Returns:
        list[dict]: One record per paragraph/table, in document order.
    """
    print(f"\n开始解析文档: {docx_path}")
    doc = Document(docx_path)

    # Take the proxy lists once instead of re-reading doc.paragraphs /
    # doc.tables for every body element inside the loop.
    paragraphs = doc.paragraphs
    tables = doc.tables
    print(f"文档包含 {len(paragraphs)} 个段落和 {len(tables)} 个表格")

    # Compare against the fully-qualified tags rather than a suffix, so only
    # real w:p / w:tbl elements advance the paragraph/table counters.
    paragraph_tag = qn('w:p')
    table_tag = qn('w:tbl')

    result = []
    para_index = 0
    table_index = 0
    for element in doc.element.body:
        if element.tag == paragraph_tag:
            paragraph = paragraphs[para_index]
            print(f"\n处理段落 {para_index}: {paragraph.text[:50]}...")
            result.append({
                "type": "text",
                "content": process_paragraph(paragraph)
            })
            para_index += 1
        elif element.tag == table_tag:
            table = tables[table_index]
            print(f"\n处理表格 {table_index} ({len(table.rows)}行×{len(table.columns)}列)")
            result.append({
                "type": "table",
                "content": process_table_with_merge_info(table)
                # No table-level "bold" key: bold is recorded per run.
            })
            table_index += 1

    print("\n文档解析完成!")
    return result
def process_table_with_merge_info(table):
    """Serialise a table: cell contents/formatting, sizes and merged regions.

    Args:
        table: A python-docx table object (uses ``rows``, ``columns`` and
            each row's ``cells``).

    Returns:
        dict with keys:
            ``rows`` / ``cols``: row and column counts.
            ``cells``: list of rows, each a list of per-cell dicts (content,
                alignment, vertical_align, border, shading, margins,
                is_merged, merge_info).
            ``merged_cells``: one entry per merged region with its start/end
                coordinates and the content of its top-left cell.
            ``row_heights`` / ``col_widths``: sizes in inches, ``None`` where
                the document does not specify one.
    """
    rows = list(table.rows)
    columns = list(table.columns)
    # Snapshot the cell grid once: the merge scan below indexes it many times
    # and re-evaluating row.cells on each probe is needlessly expensive.
    grid = [list(row.cells) for row in rows]

    table_data = {
        "rows": len(rows),
        "cols": len(columns),
        "cells": [],
        "merged_cells": [],
        "row_heights": [
            row.height.inches if row.height is not None else None
            for row in rows
        ],
        "col_widths": [
            column.width.inches if column.width is not None else None
            for column in columns
        ]
    }

    # Pass 1: per-cell content and formatting.
    cell_data_matrix = []
    for i, row_cells in enumerate(grid):
        row_data = []
        for j, cell in enumerate(row_cells):
            row_data.append({
                "row": i,
                "col": j,
                "content": process_cell_content(cell),
                "alignment": get_cell_alignment(cell),
                "vertical_align": get_vertical_alignment(cell),
                "border": get_cell_border(cell),
                "shading": get_cell_shading(cell),
                "margins": get_cell_margins(cell),
                "is_merged": False,
                "merge_info": None
            })
        cell_data_matrix.append(row_data)

    # Pass 2: merged regions. Grid positions belonging to one merged cell
    # share the same underlying _tc element, so a region is grown from its
    # top-left position rightwards and then downwards while _tc is identical.
    covered = set()  # (row, col) positions already assigned to a region
    for i, row_cells in enumerate(grid):
        for j, cell in enumerate(row_cells):
            if (i, j) in covered:
                continue
            anchor_tc = cell._tc

            # Bound by the actual row length so ragged rows cannot raise
            # IndexError.
            c2 = j
            while c2 + 1 < len(row_cells) and row_cells[c2 + 1]._tc is anchor_tc:
                c2 += 1
            r2 = i
            while (r2 + 1 < len(grid)
                   and j < len(grid[r2 + 1])
                   and grid[r2 + 1][j]._tc is anchor_tc):
                r2 += 1

            if r2 == i and c2 == j:
                continue  # not merged with any neighbour

            merge_range = f"{i},{j}-{r2},{c2}"
            table_data["merged_cells"].append({
                "start_row": i,
                "start_col": j,
                "end_row": r2,
                "end_col": c2,
                "content": process_cell_content(cell)
            })
            for r in range(i, r2 + 1):
                for c in range(j, c2 + 1):
                    covered.add((r, c))
                    if c >= len(cell_data_matrix[r]):
                        continue  # shorter row: no cell record to annotate
                    cell_data_matrix[r][c]["is_merged"] = True
                    cell_data_matrix[r][c]["merge_info"] = {
                        # Only the top-left cell of the region is primary.
                        "is_primary": r == i and c == j,
                        "merge_range": merge_range
                    }

    table_data["cells"] = cell_data_matrix
    return table_data
def get_vertical_alignment(cell):
    """Return the cell's vertical alignment: 'top', 'center' or 'bottom'.

    Reads the ``w:val`` attribute of ``w:vAlign`` under the cell's
    ``w:tcPr``. A missing element, any other value, or a lookup error all
    yield 'top'.

    Args:
        cell: Table cell whose ``_element.tcPr`` is inspected.

    Returns:
        str: 'top', 'center' or 'bottom'.
    """
    try:
        tcPr = cell._element.tcPr
        if tcPr is not None:
            vAlign = tcPr.find(qn('w:vAlign'))
            if vAlign is not None:
                value = vAlign.get(qn('w:val'))
                return value if value in ('top', 'center', 'bottom') else 'top'
    except Exception:
        # Best-effort lookup; Exception (not a bare except) so that
        # KeyboardInterrupt/SystemExit still propagate.
        print("获取垂直对齐方式失败")
    return 'top'  # default: top-aligned
def get_cell_shading(cell):
    """Return the cell's background shading, or None when it has none.

    Reads ``w:shd`` under the cell's ``w:tcPr``.

    Args:
        cell: Table cell whose ``_element.tcPr`` is inspected.

    Returns:
        dict | None: ``{'color': <w:fill value>, 'theme': <w:themeColor
        value or ''>}`` when a non-empty ``w:fill`` is present, else None.
    """
    try:
        tcPr = cell._element.tcPr
        if tcPr is not None:
            shading = tcPr.find(qn('w:shd'))
            if shading is not None:
                color = shading.get(qn('w:fill'))
                if color:
                    return {
                        'color': color,
                        # NOTE(review): 'theme' is read from w:themeColor;
                        # confirm whether w:themeFill was intended for a fill.
                        'theme': shading.get(qn('w:themeColor'), '')
                    }
    except Exception as e:
        # Best-effort: report the failure instead of hiding it, and let
        # KeyboardInterrupt/SystemExit propagate.
        print(f"获取单元格背景色失败: {str(e)}")
    return None
def get_cell_margins(cell):
    """Return the cell's explicit margins, or None when none are set.

    Reads the ``top``/``left``/``bottom``/``right`` children of ``w:tcMar``
    under the cell's ``w:tcPr``.

    Args:
        cell: Table cell whose ``_element.tcPr`` is inspected.

    Returns:
        dict | None: Maps each side that is present to
        ``{'w': <w:w value>, 'type': <w:type value>}``; None if no side is
        found.
    """
    margins = {}
    try:
        tcPr = cell._element.tcPr
        # qn() accepts a single "prefix:name" tag, not a path, so resolve
        # w:tcMar first and then look each side up beneath it.
        tcMar = tcPr.find(qn('w:tcMar')) if tcPr is not None else None
        if tcMar is not None:
            for side in ('top', 'left', 'bottom', 'right'):
                margin = tcMar.find(qn(f'w:{side}'))
                if margin is not None:
                    margins[side] = {
                        'w': margin.get(qn('w:w')),
                        'type': margin.get(qn('w:type'))
                    }
    except Exception as e:
        # Best-effort: report and keep whatever sides were already read.
        print(f"获取单元格边距失败: {str(e)}")
    return margins if margins else None
def process_cell_content(cell):
    """Return the processed form of every paragraph in *cell*, in order.

    Each paragraph goes through process_paragraph, so cell paragraphs have
    the same structure as body paragraphs.
    """
    return [process_paragraph(paragraph) for paragraph in cell.paragraphs]
def has_page_break(run):
    """Report whether *run* appears to contain a page break.

    True when the run's serialised XML contains both ``w:br`` and
    ``type="page"``, or when its text contains a form-feed character.
    """
    markup = run._element.xml
    if 'w:br' in markup and 'type="page"' in markup:
        return True
    return '\x0c' in run.text
def process_paragraph(paragraph):
    """Serialise a paragraph into its alignment and a list of run records.

    Returns:
        dict: ``{"alignment": <str>, "runs": [...]}`` where every run record
        carries ``text``, ``font``, ``style`` (the run style's name, or None)
        and ``has_page_break``.
    """
    alignment = get_alignment_with_fallback(paragraph)
    print(f"段落对齐方式: {alignment}")

    runs = [
        {
            "text": run.text,
            "font": get_font_info(run, paragraph),
            "style": run.style.name if run.style else None,
            "has_page_break": has_page_break(run)
        }
        for run in paragraph.runs
    ]
    print(f"段落包含 {len(runs)} 个文本运行(runs)")

    return {"alignment": alignment, "runs": runs}
def get_alignment_with_fallback(paragraph):
    """Resolve a paragraph's horizontal alignment as a plain string.

    Lookup order:
      1. ``paragraph.alignment`` when it is set (0-3 map to left / center /
         right / justify; any other value maps to "left").
      2. The ``w:jc`` value in the ``w:pPr`` of the paragraph's style.
      3. "left".

    The style lookup runs whenever the direct value is None *or* reading it
    fails; previously it only ran when reading it raised.

    Args:
        paragraph: Paragraph exposing ``alignment`` and ``style.element``.

    Returns:
        str: "left", "center", "right" or "justify".
    """
    # 1. Direct paragraph formatting.
    try:
        direct = paragraph.alignment
    except Exception as e:
        print(f"获取对齐方式失败: {str(e)}")
        direct = None

    if direct is not None:
        alignment_map = {
            0: "left",
            1: "center",
            2: "right",
            3: "justify"
        }
        result = alignment_map.get(direct, "left")
        print(f"从paragraph.alignment获取对齐方式: {result}")
        return result

    # 2. Alignment declared on the paragraph's style.
    try:
        jc_nodes = paragraph.style.element.xpath('w:pPr/w:jc')
        if jc_nodes:
            align_map = {
                "left": "left",
                "center": "center",
                "right": "right",
                "both": "justify",
                "start": "left",
                "end": "right"
            }
            result = align_map.get(jc_nodes[0].get(qn('w:val')), "left")
            print(f"从段落样式w:jc获取对齐方式: {result}")
            return result
    except Exception as e:
        print(f"获取对齐方式失败: {str(e)}")

    # 3. Nothing found.
    print("使用默认对齐方式(left)")
    return "left"
def get_font_info(run, paragraph):
    """Collect font attributes for a run, falling back to the paragraph style.

    Size and name come from ``run.font`` when set; otherwise from ``w:sz`` /
    ``w:rFonts`` in the ``w:rPr`` of the paragraph's style; otherwise the
    defaults 11 (pt) and "Calibri". The defaults apply whenever the style
    yields no value, not only when the lookup raises.

    Args:
        run: Run whose ``font`` is read.
        paragraph: Owning paragraph; its style supplies the fallbacks.

    Returns:
        dict: keys ``name``, ``size`` (pt), ``bold`` (None is reported as
        False), ``italic``, ``underline`` and ``color`` (from get_color_info).
    """
    font = run.font
    bold = font.bold
    font_info = {
        "name": None,
        "size": None,
        "bold": bold if bold is not None else False,
        "italic": font.italic,
        "underline": font.underline,
        "color": get_color_info(run, paragraph)
    }

    # Font size.
    if font.size:
        font_info["size"] = font.size.pt
    else:
        try:
            sz_nodes = paragraph.style.element.xpath('w:rPr/w:sz')
            if sz_nodes:
                # The stored value is halved to obtain points.
                font_info["size"] = int(sz_nodes[0].get(qn('w:val'))) / 2
                print(f"从段落样式获取字体大小: {font_info['size']}pt")
        except Exception as e:
            print(f"获取字体大小失败: {str(e)}")
        if font_info["size"] is None:
            font_info["size"] = 11  # default

    # Font name.
    if font.name:
        font_info["name"] = font.name
        print(f"从run.font获取字体: {font.name}")
    else:
        try:
            rfonts_nodes = paragraph.style.element.xpath('w:rPr/w:rFonts')
            if rfonts_nodes:
                r_fonts = rfonts_nodes[0]
                # Prefer the East Asian face, then the ASCII face.
                east_asia = r_fonts.get(qn("w:eastAsia"))
                if east_asia is not None:
                    font_info["name"] = east_asia
                    print(f"从段落样式w:eastAsia获取字体: {font_info['name']}")
                else:
                    ascii_name = r_fonts.get(qn("w:ascii"))
                    if ascii_name is not None:
                        font_info["name"] = ascii_name
                        print(f"从段落样式w:ascii获取字体: {font_info['name']}")
        except Exception as e:
            print(f"获取字体失败: {str(e)}")
        if font_info["name"] is None:
            font_info["name"] = "Calibri"  # default

    return font_info
def get_color_info(run, paragraph):
    """Return the run's text colour as ``{"r", "g", "b"}``, or None.

    Lookup order:
      1. ``run.font.color.rgb`` — unpacked as a 3-item (r, g, b) sequence.
      2. ``w:color`` in the ``w:rPr`` of the paragraph's style, parsed from a
         6-digit hex string (an 8-digit value has its first two digits
         dropped). Other values such as "auto" are ignored.

    Args:
        run: Run whose ``font.color`` is read.
        paragraph: Owning paragraph; its style supplies the fallback.

    Returns:
        dict | None: Integer channels 0-255, or None when no colour is found.
    """
    color_info = None

    # 1. Colour set directly on the run.
    try:
        rgb = getattr(run.font.color, 'rgb', None)
        if rgb is not None:
            # rgb is a sequence of three channel values, not a packed
            # integer, so unpack it rather than bit-shifting.
            red, green, blue = rgb
            color_info = {"r": int(red), "g": int(green), "b": int(blue)}
            print(f"从run.font获取颜色: RGB({color_info['r']}, {color_info['g']}, {color_info['b']})")
    except Exception as e:
        print(f"从run.font获取颜色失败: {str(e)}")

    # 2. Colour declared on the paragraph's style.
    if color_info is None:
        try:
            color_nodes = paragraph.style.element.xpath('w:rPr/w:color')
            # Attribute keys are namespace-qualified, hence qn('w:val').
            hex_color = color_nodes[0].get(qn('w:val')) if color_nodes else None
            if hex_color:
                if len(hex_color) == 8:
                    # Drop a leading two-digit prefix only when one is
                    # actually present; "FF0000" must stay intact.
                    hex_color = hex_color[2:]
                if len(hex_color) == 6:
                    rgb = tuple(int(hex_color[k:k + 2], 16) for k in (0, 2, 4))
                    color_info = {
                        "r": rgb[0],
                        "g": rgb[1],
                        "b": rgb[2]
                    }
                    print(f"从段落样式获取颜色: RGB{rgb}")
        except Exception as e:
            print(f"从段落样式获取颜色失败: {str(e)}")

    return color_info
def get_cell_alignment(cell):
    """Return the alignment of the cell's first paragraph.

    A cell without paragraphs is reported as "left".
    """
    paragraphs = cell.paragraphs
    if not paragraphs:
        return "left"
    return get_alignment_with_fallback(paragraphs[0])
def get_cell_border(cell):
    """Return border settings for the four sides of a table cell.

    Each side maps to ``{"style", "size", "color"}``. A side is read from
    ``w:tcBorders`` under the cell's ``w:tcPr`` when present (missing
    attributes default to 'single', 4 and '000000'); otherwise the default
    border (single, size 4, colour 000000) is used. If the cell has no
    ``w:tcPr``, or anything fails while reading, all four sides are the
    default.
    """
    sides = ('top', 'bottom', 'left', 'right')
    # Default: a solid line on every side.
    fallback = {
        side: {"style": "single", "size": 4, "color": "000000"}
        for side in sides
    }

    try:
        tc_pr = cell._element.tcPr
        if tc_pr is None:
            return fallback

        collected = {}
        for side in sides:
            matches = tc_pr.xpath(f'w:tcBorders/w:{side}')
            if not matches:
                collected[side] = fallback[side]
                continue
            node = matches[0]
            collected[side] = {
                "style": node.get(qn('w:val'), 'single'),
                "size": int(node.get(qn('w:sz'), '4')),
                "color": node.get(qn('w:color'), '000000')
            }
        return collected
    except Exception as e:
        print(f"获取单元格边框失败: {str(e)}, 使用默认边框")
        return fallback
def process_cell(cell):
    """Process every paragraph of *cell* and log how many there are.

    Returns the list of process_paragraph results in paragraph order.
    """
    paragraphs = cell.paragraphs
    print(f"处理单元格,包含 {len(paragraphs)} 个段落")
    return [process_paragraph(paragraph) for paragraph in paragraphs]
if __name__ == "__main__":
    import sys

    # Optional command-line overrides: [input.docx] [output.json].
    # With no arguments the original hard-coded locations are used.
    docx_path = sys.argv[1] if len(sys.argv) > 1 else r'D:\work\报告扫描\source.docx'
    output_path = sys.argv[2] if len(sys.argv) > 2 else "output.json"

    json_data = docx_to_json(docx_path)
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)
    print(f"转换完成结果已保存到{output_path}")