"""Convert a .docx file into a JSON-serialisable structure.

The document body is walked in order; every paragraph and table becomes one
entry carrying its text runs, fonts, colours, alignment and (for tables) merge
information, borders, shading, margins and dimensions.
"""
import json

from docx import Document
from docx.oxml.shared import qn

# Fallbacks used when neither the run nor the paragraph style yields a value.
_DEFAULT_FONT_SIZE_PT = 11
_DEFAULT_FONT_NAME = "Calibri"


def docx_to_json(docx_path):
    """Parse the document at *docx_path* into a list of block dictionaries.

    Args:
        docx_path: Path to the .docx file to read.

    Returns:
        A list with one dict per top-level body element, in document order.
        Paragraphs yield ``{"type": "text", "content": ...}`` and tables yield
        ``{"type": "table", "content": ...}``.
    """
    print(f"\n开始解析文档: {docx_path}")
    doc = Document(docx_path)

    # Fetch the proxy lists once instead of on every loop iteration.
    paragraphs = doc.paragraphs
    tables = doc.tables

    result = []
    para_index = 0
    table_index = 0

    print(f"文档包含 {len(paragraphs)} 个段落和 {len(tables)} 个表格")

    # Compare against the exact qualified tags rather than a suffix, so only
    # real w:p / w:tbl children advance the paragraph / table indices.
    paragraph_tag = qn('w:p')
    table_tag = qn('w:tbl')

    for element in doc.element.body:
        if element.tag == paragraph_tag:
            paragraph = paragraphs[para_index]
            print(f"\n处理段落 {para_index}: {paragraph.text[:50]}...")
            result.append({
                "type": "text",
                "content": process_paragraph(paragraph),
            })
            para_index += 1
        elif element.tag == table_tag:
            table = tables[table_index]
            print(f"\n处理表格 {table_index} ({len(table.rows)}行×{len(table.columns)}列)")
            result.append({
                "type": "table",
                # No table-level bold flag: bold is recorded on each run.
                "content": process_table_with_merge_info(table),
            })
            table_index += 1

    print("\n文档解析完成!")
    return result


def process_table_with_merge_info(table):
    """Describe a table: cells, merged regions, row heights and column widths.

    Args:
        table: A python-docx table object.

    Returns:
        A dict with keys ``rows``, ``cols``, ``cells`` (a row-major matrix of
        per-cell dicts), ``merged_cells``, ``row_heights`` and ``col_widths``
        (inches, or ``None`` where the document does not specify a value).
    """
    rows = list(table.rows)
    columns = list(table.columns)
    # Build the cell grid once; the merge scan below revisits cells many times.
    grid = [row.cells for row in rows]

    table_data = {
        "rows": len(rows),
        "cols": len(columns),
        "cells": [],
        "merged_cells": [],
        "row_heights": [None] * len(rows),
        "col_widths": [None] * len(columns),
    }

    # Pass 1: collect the content and formatting of every cell.
    cell_data_matrix = []
    for i, (row, cells) in enumerate(zip(rows, grid)):
        row_data = []
        for j, cell in enumerate(cells):
            row_data.append({
                "row": i,
                "col": j,
                "content": process_cell_content(cell),
                "alignment": get_cell_alignment(cell),
                "vertical_align": get_vertical_alignment(cell),
                "border": get_cell_border(cell),
                "shading": get_cell_shading(cell),
                "margins": get_cell_margins(cell),
                "is_merged": False,
                "merge_info": None,
            })
        cell_data_matrix.append(row_data)

        height = row.height
        if height is not None:
            table_data["row_heights"][i] = height.inches

    for j, column in enumerate(columns):
        width = column.width
        if width is not None:
            table_data["col_widths"][j] = width.inches

    # Pass 2: detect merged regions. Grid positions that belong to the same
    # merged cell share one underlying ``_tc`` element.
    covered = set()  # (row, col) positions already assigned to a merge range
    for i, cells in enumerate(grid):
        for j, cell in enumerate(cells):
            if (i, j) in covered:
                continue

            tc = cell._tc

            # Extend to the right along row i.
            c2 = j
            while c2 + 1 < len(cells) and cells[c2 + 1]._tc is tc:
                c2 += 1

            # Extend downwards along column j (guarding against short rows).
            r2 = i
            while (
                r2 + 1 < len(grid)
                and j < len(grid[r2 + 1])
                and grid[r2 + 1][j]._tc is tc
            ):
                r2 += 1

            if r2 == i and c2 == j:
                continue  # not part of a merge

            merge_range = f"{i},{j}-{r2},{c2}"
            table_data["merged_cells"].append({
                "start_row": i,
                "start_col": j,
                "end_row": r2,
                "end_col": c2,
                "content": process_cell_content(cell),
            })

            # Flag every position in the range; only the top-left is primary.
            for r in range(i, r2 + 1):
                for c in range(j, c2 + 1):
                    if c >= len(cell_data_matrix[r]):
                        continue  # ragged row: nothing to annotate here
                    covered.add((r, c))
                    entry = cell_data_matrix[r][c]
                    entry["is_merged"] = True
                    entry["merge_info"] = {
                        "is_primary": r == i and c == j,
                        "merge_range": merge_range,
                    }

    table_data["cells"] = cell_data_matrix
    return table_data


def get_vertical_alignment(cell):
    """Return the cell's vertical alignment: 'top', 'center' or 'bottom'.

    Falls back to 'top' when the cell has no ``w:vAlign`` element, when its
    value is not one of the three recognised names, or when the lookup fails.
    """
    try:
        tcPr = cell._element.tcPr
        if tcPr is not None:
            vAlign = tcPr.find(qn('w:vAlign'))
            if vAlign is not None:
                value = vAlign.get(qn('w:val'))
                return value if value in ('top', 'center', 'bottom') else 'top'
    except Exception:
        print("获取垂直对齐方式失败")
    return 'top'  # default: top-aligned


def get_cell_shading(cell):
    """Return the cell background as ``{'color', 'theme'}``, or ``None``.

    ``color`` is the ``w:fill`` attribute of the cell's ``w:shd`` element and
    ``theme`` is its ``w:themeColor`` attribute ('' when absent). ``None`` is
    returned when there is no shading element or no fill value.
    """
    try:
        tcPr = cell._element.tcPr
        if tcPr is not None:
            shading = tcPr.find(qn('w:shd'))
            if shading is not None:
                color = shading.get(qn('w:fill'))
                if color:
                    return {
                        'color': color,
                        'theme': shading.get(qn('w:themeColor'), ''),
                    }
    except Exception:
        # Best effort: shading is optional, so report "none" on any failure.
        pass
    return None


def get_cell_margins(cell):
    """Return the cell's explicit margins, or ``None`` if it defines none.

    Returns:
        A dict keyed by side ('top', 'left', 'bottom', 'right'), each value
        holding the raw ``w:w`` and ``w:type`` attribute strings of that side.
    """
    margins = {}
    try:
        tcPr = cell._element.tcPr
        tcMar = tcPr.find(qn('w:tcMar')) if tcPr is not None else None
        if tcMar is not None:
            # qn() accepts a single "prefix:name" tag only, so the container
            # and each side have to be looked up in two separate steps.
            for side in ['top', 'left', 'bottom', 'right']:
                margin = tcMar.find(qn(f'w:{side}'))
                if margin is not None:
                    margins[side] = {
                        'w': margin.get(qn('w:w')),
                        'type': margin.get(qn('w:type')),
                    }
    except Exception:
        # Best effort: keep whatever was collected before the failure.
        pass
    return margins if margins else None


def process_cell_content(cell):
    """Return the processed form of every paragraph in *cell*, in order."""
    return [process_paragraph(para) for para in cell.paragraphs]


def has_page_break(run):
    """Return True if *run* contains a page break.

    A run counts as having a page break when it holds a ``w:br`` element whose
    ``w:type`` is "page", or when its text contains a form-feed character.
    """
    if run._element.xpath('.//w:br[@w:type="page"]'):
        return True
    return '\x0c' in run.text


def process_paragraph(paragraph):
    """Return the alignment and per-run text/format data of *paragraph*.

    Returns:
        ``{"alignment": str, "runs": [...]}`` where each run dict carries
        ``text``, ``font``, ``style`` (style name or ``None``) and
        ``has_page_break``.
    """
    para_data = {
        "alignment": get_alignment_with_fallback(paragraph),
        "runs": [],
    }

    print(f"段落对齐方式: {para_data['alignment']}")

    for run in paragraph.runs:
        para_data["runs"].append({
            "text": run.text,
            "font": get_font_info(run, paragraph),
            "style": run.style.name if run.style else None,
            "has_page_break": has_page_break(run),
        })

    print(f"段落包含 {len(para_data['runs'])} 个文本运行(runs)")
    return para_data


def get_alignment_with_fallback(paragraph):
    """Return the paragraph alignment as 'left', 'center', 'right' or 'justify'.

    Resolution order:
      1. the paragraph's own ``alignment`` value;
      2. the ``w:jc`` setting of the paragraph's style;
      3. 'left'.

    The style lookup is used both when the direct alignment is ``None`` and
    when reading it raises.
    """
    alignment_map = {
        0: "left",
        1: "center",
        2: "right",
        3: "justify",
    }
    try:
        direct = paragraph.alignment
    except Exception:
        # Treat an unreadable direct alignment like a missing one.
        direct = None

    if direct is not None:
        result = alignment_map.get(direct, "left")
        print(f"从paragraph.alignment获取对齐方式: {result}")
        return result

    # No direct alignment on the paragraph: consult its style.
    try:
        jc_nodes = paragraph.style.element.xpath('w:pPr/w:jc')
        if jc_nodes:
            align_map = {
                "left": "left",
                "center": "center",
                "right": "right",
                "both": "justify",
                "start": "left",
                "end": "right",
            }
            result = align_map.get(jc_nodes[0].get(qn('w:val')), "left")
            print(f"从段落样式w:jc获取对齐方式: {result}")
            return result
    except Exception as e:
        print(f"获取对齐方式失败: {str(e)}")

    print("使用默认对齐方式(left)")
    return "left"


def get_font_info(run, paragraph):
    """Return font attributes of *run*, falling back to the paragraph style.

    Returns:
        A dict with ``name``, ``size`` (points), ``bold`` (``False`` when
        unset), ``italic``, ``underline`` and ``color``. Size and name fall
        back to the paragraph style and finally to 11 pt / "Calibri".
    """
    font = run.font
    font_info = {
        "name": None,
        "size": None,
        "bold": font.bold if font.bold is not None else False,
        "italic": font.italic,
        "underline": font.underline,
        "color": get_color_info(run, paragraph),
    }

    # Font size: run first, then paragraph style, then the default.
    if font.size:
        font_info["size"] = font.size.pt
    else:
        try:
            sz_nodes = paragraph.style.element.xpath('w:rPr/w:sz')
            if sz_nodes:
                # w:sz is expressed in half-points.
                font_info["size"] = int(sz_nodes[0].get(qn('w:val'))) / 2
                print(f"从段落样式获取字体大小: {font_info['size']}pt")
        except Exception as e:
            print(f"获取字体大小失败: {str(e)}")
        if font_info["size"] is None:
            font_info["size"] = _DEFAULT_FONT_SIZE_PT

    # Font name: run first, then paragraph style (East Asian face preferred
    # over ASCII), then the default.
    if font.name:
        font_info["name"] = font.name
        print(f"从run.font获取字体: {font.name}")
    else:
        try:
            rfonts_nodes = paragraph.style.element.xpath('w:rPr/w:rFonts')
            if rfonts_nodes:
                rfonts = rfonts_nodes[0]
                east_asia = rfonts.get(qn("w:eastAsia"))
                ascii_name = rfonts.get(qn("w:ascii"))
                if east_asia is not None:
                    font_info["name"] = east_asia
                    print(f"从段落样式w:eastAsia获取字体: {font_info['name']}")
                elif ascii_name is not None:
                    font_info["name"] = ascii_name
                    print(f"从段落样式w:ascii获取字体: {font_info['name']}")
        except Exception as e:
            print(f"获取字体失败: {str(e)}")
        if font_info["name"] is None:
            font_info["name"] = _DEFAULT_FONT_NAME

    return font_info


def _hex_to_rgb(hex_color):
    """Parse 'RRGGBB' or 'AARRGGBB' into an (r, g, b) tuple, else ``None``."""
    if not hex_color:
        return None
    if len(hex_color) == 8:
        # Drop a leading alpha byte. Only 8-digit values carry one, so a
        # plain 6-digit colour such as "FF0000" is left intact.
        hex_color = hex_color[2:]
    if len(hex_color) != 6:
        return None  # e.g. "auto"
    try:
        return tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
    except ValueError:
        return None


def get_color_info(run, paragraph):
    """Return the run's text colour as ``{"r", "g", "b"}``, or ``None``.

    The run's own colour is preferred; if it has none, the ``w:color`` of the
    paragraph style is used. ``None`` means neither source gave an RGB value.
    """
    color_info = None

    # 1. Colour set directly on the run.
    try:
        color = run.font.color
        rgb = color.rgb if color is not None else None
        # Unpack rgb as a 3-item tuple (python-docx RGBColor is a tuple
        # subclass); any other value is treated as "no explicit colour".
        if isinstance(rgb, tuple) and len(rgb) == 3:
            r, g, b = rgb
            color_info = {"r": r, "g": g, "b": b}
            print(f"从run.font获取颜色: RGB({color_info['r']}, {color_info['g']}, {color_info['b']})")
    except Exception as e:
        print(f"从run.font获取颜色失败: {str(e)}")

    # 2. Fall back to the paragraph style's colour.
    if color_info is None:
        try:
            color_nodes = paragraph.style.element.xpath('w:rPr/w:color')
            if color_nodes:
                # Attribute keys are namespace-qualified, hence qn().
                rgb = _hex_to_rgb(color_nodes[0].get(qn('w:val')))
                if rgb is not None:
                    color_info = {"r": rgb[0], "g": rgb[1], "b": rgb[2]}
                    print(f"从段落样式获取颜色: RGB{rgb}")
        except Exception as e:
            print(f"从段落样式获取颜色失败: {str(e)}")

    return color_info


def get_cell_alignment(cell):
    """Return the alignment of the cell's first paragraph ('left' if none)."""
    if cell.paragraphs:
        return get_alignment_with_fallback(cell.paragraphs[0])
    return "left"


def get_cell_border(cell):
    """Return border settings for the four sides of *cell*.

    Each side maps to ``{"style", "size", "color"}``. Sides without an
    explicit ``w:tcBorders`` entry, and any failure while reading, fall back
    to a single black line of size 4.
    """
    default_border = {
        "top": {"style": "single", "size": 4, "color": "000000"},
        "bottom": {"style": "single", "size": 4, "color": "000000"},
        "left": {"style": "single", "size": 4, "color": "000000"},
        "right": {"style": "single", "size": 4, "color": "000000"},
    }

    try:
        tcPr = cell._element.tcPr
        if tcPr is None:
            return default_border

        borders = {}
        for side in ['top', 'bottom', 'left', 'right']:
            found = tcPr.xpath(f'w:tcBorders/w:{side}')
            if not found:
                borders[side] = default_border[side]
                continue
            border = found[0]
            borders[side] = {
                "style": border.get(qn('w:val'), 'single'),
                "size": int(border.get(qn('w:sz'), '4')),
                "color": border.get(qn('w:color'), '000000'),
            }
        return borders
    except Exception as e:
        print(f"获取单元格边框失败: {str(e)}, 使用默认边框")
        return default_border


def process_cell(cell):
    """Return the processed paragraphs of *cell*, logging the paragraph count."""
    print(f"处理单元格,包含 {len(cell.paragraphs)} 个段落")
    return [process_paragraph(para) for para in cell.paragraphs]


if __name__ == "__main__":
    docx_path = r'D:\work\报告扫描\source.docx'
    json_data = docx_to_json(docx_path)

    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)

    print("转换完成,结果已保存到output.json")