Docx2Json_Json2Docx/docx_to_json.py

397 lines
14 KiB
Python
Raw Normal View History

2025-07-02 16:17:54 +08:00
import json
from docx import Document
from docx.oxml.shared import qn
def docx_to_json(docx_path):
    """Convert a .docx file into an ordered list of JSON-serializable blocks.

    The document body is walked in order so that paragraphs and tables keep
    their relative positions in the output.

    Args:
        docx_path: Path of the .docx file; passed straight to ``Document``.

    Returns:
        A list of dicts. Paragraphs become ``{"type": "text", "content": ...}``
        (content from ``process_paragraph``) and tables become
        ``{"type": "table", "content": ...}`` (content from
        ``process_table_with_merge_info``). Other body elements are skipped.
    """
    print(f"\n开始解析文档: {docx_path}")
    doc = Document(docx_path)

    # Snapshot both lists once so the loop below does not re-evaluate the
    # ``doc.paragraphs`` / ``doc.tables`` properties for every body element.
    paragraphs = doc.paragraphs
    tables = doc.tables

    # Compare against the exact qualified tags instead of a name suffix, so
    # that only real w:p / w:tbl elements advance the running indexes.
    paragraph_tag = qn('w:p')
    table_tag = qn('w:tbl')

    result = []
    para_index = 0
    table_index = 0
    print(f"文档包含 {len(paragraphs)} 个段落和 {len(tables)} 个表格")

    for element in doc.element.body:
        if element.tag == paragraph_tag:
            paragraph = paragraphs[para_index]
            print(f"\n处理段落 {para_index}: {paragraph.text[:50]}...")
            para_data = process_paragraph(paragraph)
            result.append({
                "type": "text",
                "content": para_data
            })
            para_index += 1
        elif element.tag == table_tag:
            table = tables[table_index]
            print(f"\n处理表格 {table_index} ({len(table.rows)}行×{len(table.columns)}列)")
            table_data = process_table_with_merge_info(table)
            result.append({
                "type": "table",
                "content": table_data
                # No table-level "bold" key: bold is recorded per run instead.
            })
            table_index += 1

    print("\n文档解析完成!")
    return result
def process_table_with_merge_info(table):
    """Serialize a table: cell contents, formatting, sizes and merged regions.

    Args:
        table: A table object exposing ``rows`` (each with ``cells`` and
            ``height``) and ``columns`` (each with ``width``).

    Returns:
        A dict with keys:
          * ``rows`` / ``cols``: row and column counts.
          * ``cells``: matrix (list of rows) of per-cell dicts holding
            content, alignment, vertical alignment, border, shading, margins
            and merge flags.
          * ``merged_cells``: one entry per detected merged rectangle.
          * ``row_heights`` / ``col_widths``: sizes in inches, ``None`` where
            the document does not specify one.
    """
    rows = list(table.rows)
    columns = list(table.columns)
    # Resolve each row's cells exactly once. The merge scan below compares
    # cells by the identity of their underlying ``_tc`` element, so holding
    # one stable grid avoids re-resolving ``table.rows[i].cells`` inside the
    # nested loops.
    grid = [list(row.cells) for row in rows]

    table_data = {
        "rows": len(rows),
        "cols": len(columns),
        "cells": [],
        "merged_cells": [],
        "row_heights": [None] * len(rows),
        "col_widths": [None] * len(columns)
    }

    # Sizes belong to rows/columns, so record them directly rather than as a
    # side effect of visiting the last cell of a row / the last row.
    for i, row in enumerate(rows):
        if row.height is not None:
            table_data["row_heights"][i] = row.height.inches
    for j, column in enumerate(columns):
        if column.width is not None:
            table_data["col_widths"][j] = column.width.inches

    # First pass: serialize every grid position.
    cell_data_matrix = []
    for i, row_cells in enumerate(grid):
        row_data = []
        for j, cell in enumerate(row_cells):
            row_data.append({
                "row": i,
                "col": j,
                "content": process_cell_content(cell),
                "alignment": get_cell_alignment(cell),
                "vertical_align": get_vertical_alignment(cell),
                "border": get_cell_border(cell),
                "shading": get_cell_shading(cell),
                "margins": get_cell_margins(cell),
                "is_merged": False,
                "merge_info": None
            })
        cell_data_matrix.append(row_data)

    # Second pass: detect merged rectangles. Grid positions that share the
    # same underlying ``_tc`` element belong to one merged cell.
    merge_ranges = []
    for i, row_cells in enumerate(grid):
        for j, cell in enumerate(row_cells):
            # Skip positions already covered by a previously found region.
            if any(r1 <= i <= r2 and c1 <= j <= c2
                   for (r1, r2, c1, c2) in merge_ranges):
                continue

            tc = cell._tc
            end_row, end_col = i, j
            # Extend to the right along the current row.
            while end_col + 1 < len(row_cells) and row_cells[end_col + 1]._tc is tc:
                end_col += 1
            # Extend downwards along the current column; the per-row length
            # check keeps ragged rows from raising IndexError.
            while (end_row + 1 < len(grid)
                   and j < len(grid[end_row + 1])
                   and grid[end_row + 1][j]._tc is tc):
                end_row += 1

            if end_row == i and end_col == j:
                continue  # not part of a merge

            merge_ranges.append((i, end_row, j, end_col))
            merge_range = f"{i},{j}-{end_row},{end_col}"

            # The top-left position is the primary cell of the region.
            cell_data_matrix[i][j]["is_merged"] = True
            cell_data_matrix[i][j]["merge_info"] = {
                "is_primary": True,
                "merge_range": merge_range
            }
            table_data["merged_cells"].append({
                "start_row": i,
                "start_col": j,
                "end_row": end_row,
                "end_col": end_col,
                "content": process_cell_content(cell)
            })

            # Every other position in the rectangle is a non-primary member.
            for r in range(i, end_row + 1):
                for c in range(j, end_col + 1):
                    if (r == i and c == j) or c >= len(cell_data_matrix[r]):
                        continue
                    cell_data_matrix[r][c]["is_merged"] = True
                    cell_data_matrix[r][c]["merge_info"] = {
                        "is_primary": False,
                        "merge_range": merge_range
                    }

    table_data["cells"] = cell_data_matrix
    return table_data
def get_vertical_alignment(cell):
    """Return the cell's vertical alignment: 'top', 'center' or 'bottom'.

    Reads the ``w:val`` attribute of ``w:vAlign`` under the cell's ``tcPr``.
    Falls back to ``'top'`` when the cell has no ``tcPr``, no ``w:vAlign``,
    an unrecognized value, or when the lookup fails.
    """
    align_map = {
        'top': 'top',
        'center': 'center',
        'bottom': 'bottom'
    }
    try:
        tcPr = cell._element.tcPr
        if tcPr is not None:
            vAlign = tcPr.find(qn('w:vAlign'))
            if vAlign is not None:
                return align_map.get(vAlign.get(qn('w:val')), 'top')
    except Exception:
        # Best-effort lookup: report and fall back to the default, but let
        # KeyboardInterrupt / SystemExit propagate.
        print("获取垂直对齐方式失败")
    return 'top'  # default: top-aligned
def get_cell_shading(cell):
    """Return the cell's background shading, or ``None`` if there is none.

    Returns:
        ``{'color': <w:fill value>, 'theme': <w:themeColor value or ''>}``
        when the cell's ``tcPr`` has a ``w:shd`` element with a non-empty
        ``w:fill``; otherwise ``None`` (also when the lookup fails).
    """
    try:
        tcPr = cell._element.tcPr
        if tcPr is None:
            return None
        shading = tcPr.find(qn('w:shd'))
        if shading is None:
            return None
        color = shading.get(qn('w:fill'))
        if color:
            return {
                'color': color,
                # NOTE(review): 'theme' is read from w:themeColor; the theme
                # attribute paired with w:fill may be w:themeFill - confirm.
                'theme': shading.get(qn('w:themeColor'), '')
            }
    except Exception as e:
        # Best-effort lookup: report instead of silently swallowing, and let
        # KeyboardInterrupt / SystemExit propagate.
        print(f"获取单元格背景色失败: {str(e)}")
    return None
def get_cell_margins(cell):
    """Return the cell's explicit margins, or ``None`` if none are set.

    Returns:
        A dict keyed by side (``'top'``, ``'left'``, ``'bottom'``,
        ``'right'``) for each side present under ``w:tcMar``; each value is
        ``{'w': <w:w attribute>, 'type': <w:type attribute>}``. ``None`` when
        no side is found or the lookup fails.
    """
    margins = {}
    try:
        tcPr = cell._element.tcPr
        if tcPr is not None:
            # Resolve the path one element at a time: ``qn`` accepts a single
            # prefixed name, so a combined 'w:tcMar/w:top' cannot be passed
            # to it.
            tc_mar = tcPr.find(qn('w:tcMar'))
            if tc_mar is not None:
                for side in ['top', 'left', 'bottom', 'right']:
                    margin = tc_mar.find(qn(f'w:{side}'))
                    if margin is not None:
                        margins[side] = {
                            'w': margin.get(qn('w:w')),
                            'type': margin.get(qn('w:type'))
                        }
    except Exception as e:
        # Best-effort lookup: report instead of silently swallowing, and let
        # KeyboardInterrupt / SystemExit propagate.
        print(f"获取单元格边距失败: {str(e)}")
    return margins if margins else None
def process_cell_content(cell):
    """Serialize a cell's paragraphs by reusing ``process_paragraph``.

    Returns a list with one ``process_paragraph`` result per paragraph in
    ``cell.paragraphs``, in document order.
    """
    return [process_paragraph(paragraph) for paragraph in cell.paragraphs]
def has_page_break(run):
    """Report whether a run appears to contain a page break.

    True when the run's serialized XML contains both ``w:br`` and
    ``type="page"``, or when the run's text contains a form-feed character.
    """
    markup = run._element.xml
    if 'w:br' in markup and 'type="page"' in markup:
        return True
    # Only consult the text when the XML did not already show a page break.
    return '\x0c' in run.text
def process_paragraph(paragraph):
    """Serialize a paragraph into its alignment plus a list of run records.

    Returns:
        ``{"alignment": <str>, "runs": [...]}`` where every run record holds
        the run's text, font info (``get_font_info``), character style name
        (or ``None``) and a page-break flag (``has_page_break``).
    """
    alignment = get_alignment_with_fallback(paragraph)
    para_data = {"alignment": alignment, "runs": []}
    print(f"段落对齐方式: {para_data['alignment']}")

    run_records = para_data["runs"]
    for run in paragraph.runs:
        run_style = run.style
        run_records.append({
            "text": run.text,
            "font": get_font_info(run, paragraph),
            "style": run_style.name if run_style else None,
            "has_page_break": has_page_break(run),
        })

    print(f"段落包含 {len(para_data['runs'])} 个文本运行(runs)")
    return para_data
def get_alignment_with_fallback(paragraph):
    """Return the paragraph alignment as 'left', 'center', 'right' or 'justify'.

    Lookup order:
      1. ``paragraph.alignment`` when it is set (values 0-3 are mapped; any
         other value yields ``'left'``).
      2. The ``w:jc`` value in the ``w:pPr`` of the paragraph's style. This
         is tried whenever step 1 yields nothing, i.e. both when the direct
         alignment is ``None`` and when reading it fails.
      3. The default ``'left'``.
    """
    alignment_map = {
        0: "left",
        1: "center",
        2: "right",
        3: "justify"
    }
    style_align_map = {
        "left": "left",
        "center": "center",
        "right": "right",
        "both": "justify",
        "start": "left",
        "end": "right"
    }

    # 1. Direct paragraph formatting.
    try:
        alignment = paragraph.alignment
        if alignment is not None:
            result = alignment_map.get(alignment, "left")
            print(f"从paragraph.alignment获取对齐方式: {result}")
            return result
    except Exception as e:
        print(f"获取对齐方式失败: {str(e)}")

    # 2. Paragraph style. This must run outside the ``except`` above: an
    #    unset alignment is reported as None, not as an exception.
    try:
        p_pr = paragraph.style.element.xpath('w:pPr')
        jc = p_pr[0].xpath('w:jc') if p_pr else []
        if jc:
            result = style_align_map.get(jc[0].get(qn('w:val')), "left")
            print(f"从段落样式w:jc获取对齐方式: {result}")
            return result
    except Exception as e:
        print(f"获取对齐方式失败: {str(e)}")

    # 3. Default.
    print("使用默认对齐方式(left)")
    return "left"
def get_font_info(run, paragraph):
    """Collect font attributes for a run, falling back to the paragraph style.

    Args:
        run: The run whose ``font`` is inspected.
        paragraph: The paragraph containing the run; its style's ``w:rPr`` is
            consulted when the run itself carries no size or font name.

    Returns:
        A dict with keys ``name``, ``size`` (points), ``bold`` (``False``
        when unset), ``italic``, ``underline`` and ``color`` (from
        ``get_color_info``). ``size`` defaults to 11 and ``name`` to
        ``"Calibri"`` whenever neither the run nor the style provides one.
    """
    font = run.font
    font_info = {
        "name": None,
        "size": None,
        "bold": font.bold if font.bold is not None else False,
        "italic": font.italic,
        "underline": font.underline,
        "color": get_color_info(run, paragraph)
    }

    # Font size: run first, then the paragraph style's w:sz.
    if font.size:
        font_info["size"] = font.size.pt
    else:
        try:
            r_pr = paragraph.style.element.xpath('w:rPr')
            sz = r_pr[0].xpath('w:sz') if r_pr else []
            if sz:
                # w:sz is halved here to obtain points.
                font_info["size"] = int(sz[0].attrib[qn('w:val')]) / 2
                print(f"从段落样式获取字体大小: {font_info['size']}pt")
        except Exception as e:
            print(f"获取字体大小失败: {str(e)}")
        # Apply the default whenever nothing was found, not only on errors.
        if font_info["size"] is None:
            font_info["size"] = 11

    # Font name: run first, then the paragraph style's w:rFonts, preferring
    # the East Asian face over the ASCII one.
    if font.name:
        font_info["name"] = font.name
        print(f"从run.font获取字体: {font.name}")
    else:
        try:
            r_pr = paragraph.style.element.xpath('w:rPr')
            r_fonts = r_pr[0].xpath('w:rFonts') if r_pr else []
            if r_fonts:
                attrs = r_fonts[0].attrib
                for attr_name in ("w:eastAsia", "w:ascii"):
                    value = attrs.get(qn(attr_name))
                    if value is not None:
                        font_info["name"] = value
                        print(f"从段落样式{attr_name}获取字体: {font_info['name']}")
                        break
        except Exception as e:
            print(f"获取字体失败: {str(e)}")
        # Apply the default whenever nothing was found, not only on errors.
        if font_info["name"] is None:
            font_info["name"] = "Calibri"

    return font_info
def get_color_info(run, paragraph):
    """Return the run's text colour as ``{"r", "g", "b"}``, or ``None``.

    Lookup order:
      1. ``run.font.color.rgb``. Accepted either as a 3-item ``(r, g, b)``
         sequence or as a packed ``0xRRGGBB`` integer.
      2. The ``w:color`` value in the ``w:rPr`` of the paragraph's style,
         skipping the value ``auto``.

    Returns ``None`` when neither source yields an explicit colour.
    """
    color_info = None

    # 1. Colour set directly on the run.
    try:
        color = run.font.color
        rgb = getattr(color, 'rgb', None) if color is not None else None
        if rgb is not None:
            if isinstance(rgb, int):
                red, green, blue = (rgb >> 16) & 0xff, (rgb >> 8) & 0xff, rgb & 0xff
            else:
                # Bit-shifting is not valid on an (r, g, b) sequence; unpack it.
                red, green, blue = rgb
            color_info = {"r": red, "g": green, "b": blue}
            print(f"从run.font获取颜色: RGB({color_info['r']}, {color_info['g']}, {color_info['b']})")
    except Exception as e:
        print(f"从run.font获取颜色失败: {str(e)}")

    # 2. Colour defined by the paragraph style.
    if color_info is None:
        try:
            r_pr = paragraph.style.element.xpath('w:rPr')
            color_nodes = r_pr[0].xpath('w:color') if r_pr else []
            if color_nodes:
                # Attribute keys are namespace-qualified, so look the value
                # up through qn() rather than the literal 'w:val'.
                hex_color = color_nodes[0].get(qn('w:val'))
                if hex_color and hex_color.lower() != 'auto':
                    # Drop a leading alpha byte only from 8-digit values; a
                    # 6-digit colour such as FF0000 must be kept intact.
                    if len(hex_color) == 8:
                        hex_color = hex_color[2:]
                    rgb = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
                    color_info = {
                        "r": rgb[0],
                        "g": rgb[1],
                        "b": rgb[2]
                    }
                    print(f"从段落样式获取颜色: RGB{rgb}")
        except Exception as e:
            print(f"从段落样式获取颜色失败: {str(e)}")

    return color_info
def get_cell_alignment(cell):
    """Return the alignment of the cell's first paragraph.

    A cell without paragraphs is reported as ``"left"``.
    """
    paragraphs = cell.paragraphs
    if not paragraphs:
        return "left"
    return get_alignment_with_fallback(paragraphs[0])
def get_cell_border(cell):
    """Return border settings for the four sides of a cell.

    Each side maps to ``{"style", "size", "color"}``. A side that has no
    ``w:tcBorders`` entry, a cell without ``tcPr``, and any lookup failure
    all fall back to a single black line of size 4.
    """
    sides = ('top', 'bottom', 'left', 'right')
    # Default: solid single-line border on every side.
    default_border = {
        side: {"style": "single", "size": 4, "color": "000000"}
        for side in sides
    }
    try:
        tcPr = cell._element.tcPr
        if tcPr is None:
            return default_border

        borders = {}
        for side in sides:
            matches = tcPr.xpath(f'w:tcBorders/w:{side}')
            if not matches:
                borders[side] = default_border[side]
                continue
            node = matches[0]
            style_value = node.get(qn('w:val'), 'single')
            size_value = node.get(qn('w:sz'), '4')
            color_value = node.get(qn('w:color'), '000000')
            borders[side] = {
                "style": style_value,
                "size": int(size_value),
                "color": color_value,
            }
        return borders
    except Exception as e:
        print(f"获取单元格边框失败: {str(e)}, 使用默认边框")
        return default_border
def process_cell(cell):
    """Serialize every paragraph of a cell, logging the paragraph count.

    Returns a list with one ``process_paragraph`` result per paragraph in
    ``cell.paragraphs``, in document order.
    """
    print(f"处理单元格,包含 {len(cell.paragraphs)} 个段落")
    return [process_paragraph(paragraph) for paragraph in cell.paragraphs]
if __name__ == "__main__":
    import sys

    # Script entry point. Optional command-line overrides:
    #   argv[1] - input .docx path  (default: the original hard-coded path)
    #   argv[2] - output .json path (default: output.json)
    docx_path = sys.argv[1] if len(sys.argv) > 1 else r'D:\work\报告扫描\source.docx'
    output_path = sys.argv[2] if len(sys.argv) > 2 else "output.json"

    json_data = docx_to_json(docx_path)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)
    # Report the path actually written (identical to the old message when
    # the default output path is used).
    print(f"转换完成结果已保存到{output_path}")