删除docxjson项目
This commit is contained in:
parent
52634c1c7d
commit
ead688513c
Binary file not shown.
Binary file not shown.
|
@ -1,396 +0,0 @@
|
|||
import json
|
||||
from docx import Document
|
||||
from docx.oxml.shared import qn
|
||||
|
||||
|
||||
def docx_to_json(docx_path):
    """Convert a .docx file into a JSON-serialisable list of body elements.

    The document body is walked in order so paragraphs and tables keep their
    relative positions. Each paragraph becomes
    ``{"type": "text", "content": ...}`` (content from ``process_paragraph``)
    and each table becomes ``{"type": "table", "content": ...}`` (content
    from ``process_table_with_merge_info``). Any other body child is skipped.

    Args:
        docx_path: Path of the document; passed unchanged to ``Document``.

    Returns:
        A list of dicts in document order.
    """
    print(f"\n开始解析文档: {docx_path}")
    doc = Document(docx_path)

    # Read both collections once so the loop below indexes plain lists
    # instead of re-reading the properties for every body element.
    paragraphs = doc.paragraphs
    tables = doc.tables
    print(f"文档包含 {len(paragraphs)} 个段落和 {len(tables)} 个表格")

    # Compare against the exact qualified tags; a suffix test such as
    # endswith('p') would also accept any other tag that ends in "p".
    paragraph_tag = qn('w:p')
    table_tag = qn('w:tbl')

    result = []
    para_index = 0
    table_index = 0

    for element in doc.element.body:
        if element.tag == paragraph_tag:
            paragraph = paragraphs[para_index]
            print(f"\n处理段落 {para_index}: {paragraph.text[:50]}...")
            result.append({
                "type": "text",
                "content": process_paragraph(paragraph),
            })
            para_index += 1
        elif element.tag == table_tag:
            table = tables[table_index]
            print(f"\n处理表格 {table_index} ({len(table.rows)}行×{len(table.columns)}列)")
            # Bold is recorded per run inside the table content, not on the
            # table entry itself.
            result.append({
                "type": "table",
                "content": process_table_with_merge_info(table),
            })
            table_index += 1

    print("\n文档解析完成!")
    return result
|
||||
|
||||
def process_table_with_merge_info(table):
    """Serialise a table, including merge ranges, row heights and column widths.

    Args:
        table: A table object exposing ``rows`` and ``columns``; each row
            exposes ``cells`` and ``height``, each column exposes ``width``.

    Returns:
        A dict with keys ``rows``, ``cols``, ``cells`` (matrix of per-cell
        dicts), ``merged_cells`` (one entry per merged rectangle),
        ``row_heights`` and ``col_widths`` (inches, or None when unset).
    """
    rows = list(table.rows)
    row_count = len(rows)
    col_count = len(table.columns)
    # Snapshot every row's cells once. The merge detection below compares
    # cells many times, and this avoids re-reading ``row.cells`` inside the
    # innermost loops.
    grid = [row.cells for row in rows]

    table_data = {
        "rows": row_count,
        "cols": col_count,
        "cells": [],
        "merged_cells": [],
        "row_heights": [None] * row_count,
        "col_widths": [None] * col_count,
    }

    # Pass 1: serialise every grid position.
    cell_data_matrix = []
    for i, (row, row_cells) in enumerate(zip(rows, grid)):
        row_data = []
        for j, cell in enumerate(row_cells):
            row_data.append({
                "row": i,
                "col": j,
                "content": process_cell_content(cell),
                "alignment": get_cell_alignment(cell),
                "vertical_align": get_vertical_alignment(cell),
                "border": get_cell_border(cell),
                "shading": get_cell_shading(cell),
                "margins": get_cell_margins(cell),
                "is_merged": False,
                "merge_info": None,
            })
        # A row height is only recorded for rows that have at least one cell.
        if row_cells and row.height is not None:
            table_data["row_heights"][i] = row.height.inches
        cell_data_matrix.append(row_data)

    # Column widths are sampled once, using the positions of the last row.
    if grid:
        for j in range(len(grid[-1])):
            width = table.columns[j].width
            if width is not None:
                table_data["col_widths"][j] = width.inches

    # Pass 2: detect merged rectangles. Grid positions that share the same
    # underlying ``_tc`` element belong to one merged cell.
    merge_ranges = []
    for i, row_cells in enumerate(grid):
        for j, cell in enumerate(row_cells):
            # Skip positions already covered by a recorded rectangle.
            if any(r1 <= i <= r2 and c1 <= j <= c2
                   for (r1, r2, c1, c2) in merge_ranges):
                continue

            r2, c2 = i, j
            # Extend to the right along row i.
            while c2 + 1 < col_count and grid[i][c2 + 1]._tc is cell._tc:
                c2 += 1
            # Extend downwards along column j.
            while r2 + 1 < row_count and grid[r2 + 1][j]._tc is cell._tc:
                r2 += 1

            if r2 == i and c2 == j:
                continue  # not merged

            merge_ranges.append((i, r2, j, c2))
            merge_range = f"{i},{j}-{r2},{c2}"

            # The top-left position is the primary cell of the rectangle.
            cell_data_matrix[i][j]["is_merged"] = True
            cell_data_matrix[i][j]["merge_info"] = {
                "is_primary": True,
                "merge_range": merge_range,
            }

            table_data["merged_cells"].append({
                "start_row": i,
                "start_col": j,
                "end_row": r2,
                "end_col": c2,
                "content": process_cell_content(cell),
            })

            # Every other position in the rectangle is a non-primary member.
            for r in range(i, r2 + 1):
                for c in range(j, c2 + 1):
                    if r == i and c == j:
                        continue
                    cell_data_matrix[r][c]["is_merged"] = True
                    cell_data_matrix[r][c]["merge_info"] = {
                        "is_primary": False,
                        "merge_range": merge_range,
                    }

    table_data["cells"] = cell_data_matrix
    return table_data
|
||||
|
||||
def get_vertical_alignment(cell):
    """Return the cell's vertical alignment: 'top', 'center' or 'bottom'.

    Reads the ``w:val`` attribute of the ``w:vAlign`` child of the cell's
    ``tcPr``. Any missing piece, any other value, or any error while reading
    yields the default 'top'.

    Args:
        cell: Object whose ``_element.tcPr`` is the cell-properties element
            (or None).

    Returns:
        One of 'top', 'center', 'bottom'.
    """
    try:
        tcPr = cell._element.tcPr
        if tcPr is not None:
            vAlign = tcPr.find(qn('w:vAlign'))
            if vAlign is not None:
                value = vAlign.get(qn('w:val'))
                if value in ('top', 'center', 'bottom'):
                    return value
                return 'top'
    except Exception:
        # Best effort by design, but no longer a bare ``except:`` that would
        # also swallow KeyboardInterrupt / SystemExit.
        print("获取垂直对齐方式失败")
    return 'top'  # default: top alignment
|
||||
|
||||
def get_cell_shading(cell):
    """Return the cell's background shading, or None when there is none.

    Reads the ``w:shd`` child of the cell's ``tcPr``. When it carries a
    non-empty ``w:fill`` attribute the result is
    ``{'color': <fill>, 'theme': <w:themeColor or ''>}``.

    Args:
        cell: Object whose ``_element.tcPr`` is the cell-properties element
            (or None).

    Returns:
        A dict with ``color`` and ``theme`` keys, or None.
    """
    try:
        tcPr = cell._element.tcPr
        if tcPr is None:
            return None
        shading = tcPr.find(qn('w:shd'))
        if shading is None:
            return None
        color = shading.get(qn('w:fill'))
        if color:
            return {
                'color': color,
                'theme': shading.get(qn('w:themeColor'), ''),
            }
    except Exception as e:
        # Still best effort, but the failure is now reported instead of
        # being silently discarded by a bare ``except: pass``.
        print(f"获取单元格背景色失败: {str(e)}")
    return None
|
||||
|
||||
def get_cell_margins(cell):
    """Return the cell's explicit margins, or None when none are set.

    Looks up ``w:tcMar`` under the cell's ``tcPr`` and, for each of the
    sides top/left/bottom/right that is present, records its ``w:w`` and
    ``w:type`` attributes.

    The earlier implementation passed the compound path ``w:tcMar/w:<side>``
    to ``qn``, which only accepts a single ``prefix:name`` pair; the
    resulting error was swallowed, so no margins were ever returned. The
    lookup is now done in two steps, one tag at a time.

    Args:
        cell: Object whose ``_element.tcPr`` is the cell-properties element
            (or None).

    Returns:
        ``{side: {'w': ..., 'type': ...}}`` for the sides found, or None.
    """
    margins = {}
    try:
        tcPr = cell._element.tcPr
        if tcPr is not None:
            tcMar = tcPr.find(qn('w:tcMar'))
            if tcMar is not None:
                for side in ('top', 'left', 'bottom', 'right'):
                    margin = tcMar.find(qn(f'w:{side}'))
                    if margin is not None:
                        margins[side] = {
                            'w': margin.get(qn('w:w')),
                            'type': margin.get(qn('w:type')),
                        }
    except Exception as e:
        # Best effort: report the problem and return whatever was collected.
        print(f"获取单元格边距失败: {str(e)}")
    return margins if margins else None
|
||||
|
||||
def process_cell_content(cell):
    """Serialise every paragraph of a cell with ``process_paragraph``.

    Args:
        cell: Object exposing an iterable ``paragraphs`` attribute.

    Returns:
        A list with one ``process_paragraph`` result per paragraph, in order.
    """
    return [process_paragraph(para) for para in cell.paragraphs]
|
||||
|
||||
def has_page_break(run):
    """Tell whether a run contains a page break.

    A run counts as a page break when its XML text contains both ``w:br``
    and ``type="page"``, or when its text contains a form-feed character.

    Args:
        run: Object exposing ``_element.xml`` (str) and ``text`` (str).

    Returns:
        True when a page break is detected, otherwise False.
    """
    markup = run._element.xml
    if 'w:br' in markup and 'type="page"' in markup:
        return True
    return '\x0c' in run.text
|
||||
|
||||
def process_paragraph(paragraph):
    """Serialise a paragraph into its alignment and a list of runs.

    Args:
        paragraph: Object exposing ``runs``; it is also handed to
            ``get_alignment_with_fallback`` and ``get_font_info``.

    Returns:
        ``{"alignment": str, "runs": [...]}`` where each run entry holds the
        run's ``text``, ``font`` info, ``style`` name (or None) and a
        ``has_page_break`` flag.
    """
    alignment = get_alignment_with_fallback(paragraph)
    print(f"段落对齐方式: {alignment}")

    runs = []
    for run in paragraph.runs:
        runs.append({
            "text": run.text,
            "font": get_font_info(run, paragraph),
            "style": run.style.name if run.style else None,
            "has_page_break": has_page_break(run),
        })

    print(f"段落包含 {len(runs)} 个文本运行(runs)")
    return {"alignment": alignment, "runs": runs}
|
||||
|
||||
def get_alignment_with_fallback(paragraph):
    """Resolve a paragraph's horizontal alignment as a plain string.

    Resolution order:
      1. ``paragraph.alignment`` when it is not None (0/1/2/3 map to
         left/center/right/justify; any other value maps to "left").
      2. The ``w:jc`` value in the ``w:pPr`` of the paragraph's style.
      3. The default "left".

    The earlier version placed step 2 inside the ``except`` branch of
    step 1, so a paragraph whose own alignment was simply None never
    consulted its style. Step 2 now runs whenever step 1 yields nothing.

    Args:
        paragraph: Object exposing ``alignment`` and ``style.element.xpath``.

    Returns:
        One of "left", "center", "right", "justify".
    """
    direct_map = {
        0: "left",
        1: "center",
        2: "right",
        3: "justify",
    }
    try:
        alignment = paragraph.alignment
    except Exception:
        # Reading the direct value failed; treat it as unset and fall back.
        alignment = None

    if alignment is not None:
        result = direct_map.get(alignment, "left")
        print(f"从paragraph.alignment获取对齐方式: {result}")
        return result

    style_map = {
        "left": "left",
        "center": "center",
        "right": "right",
        "both": "justify",
        "start": "left",
        "end": "right",
    }
    try:
        # The attribute XPath returns the w:val strings directly, so a style
        # without w:pPr or w:jc simply gives an empty list.
        jc_values = paragraph.style.element.xpath('w:pPr/w:jc/@w:val')
        if jc_values:
            result = style_map.get(str(jc_values[0]), "left")
            print(f"从段落样式w:jc获取对齐方式: {result}")
            return result
    except Exception as e:
        print(f"获取对齐方式失败: {str(e)}")

    print("使用默认对齐方式(left)")
    return "left"
|
||||
|
||||
def get_font_info(run, paragraph):
    """Collect font attributes of a run, falling back to the paragraph style.

    Size and name are taken from ``run.font`` when set. Otherwise the
    ``w:rPr`` of the paragraph's style is consulted (``w:sz`` for the size in
    half-points, ``w:rFonts`` for the name, preferring ``w:eastAsia`` over
    ``w:ascii``). If that lookup raises, the defaults 11 pt / "Calibri" are
    used. If the style has an ``w:rPr`` but lacks the specific child, the
    value stays None.

    Args:
        run: Object exposing ``font``; also passed to ``get_color_info``.
        paragraph: Object exposing ``style.element.xpath``.

    Returns:
        A dict with keys ``name``, ``size``, ``bold``, ``italic``,
        ``underline`` and ``color``.
    """
    font = run.font
    font_info = {
        "name": None,
        "size": None,
        "bold": font.bold if font.bold is not None else False,  # unset -> False
        "italic": font.italic,
        "underline": font.underline,
        "color": get_color_info(run, paragraph),
    }

    # Font size
    if font.size:
        font_info["size"] = font.size.pt
    else:
        try:
            p_rpr = paragraph.style.element.xpath('w:rPr')[0]
            sz_nodes = p_rpr.xpath('w:sz')
            if sz_nodes:
                # w:sz is expressed in half-points.
                font_info["size"] = int(sz_nodes[0].attrib[qn('w:val')]) / 2
                print(f"从段落样式获取字体大小: {font_info['size']}pt")
        except Exception as e:
            print(f"获取字体大小失败: {str(e)}")
            font_info["size"] = 11  # default

    # Font name
    if font.name:
        font_info["name"] = font.name
        print(f"从run.font获取字体: {font.name}")
    else:
        try:
            p_rpr = paragraph.style.element.xpath('w:rPr')[0]
            r_fonts = p_rpr.xpath('w:rFonts')
            if r_fonts:
                attrs = r_fonts[0].attrib
                east_asia = attrs.get(qn("w:eastAsia"))
                if east_asia is not None:
                    font_info["name"] = east_asia
                    print(f"从段落样式w:eastAsia获取字体: {font_info['name']}")
                else:
                    # A missing w:ascii raises KeyError, which the handler
                    # below turns into the "Calibri" default.
                    font_info["name"] = attrs[qn("w:ascii")]
                    print(f"从段落样式w:ascii获取字体: {font_info['name']}")
        except Exception as e:
            print(f"获取字体失败: {str(e)}")
            font_info["name"] = "Calibri"  # default

    return font_info
|
||||
|
||||
def get_color_info(run, paragraph):
    """Return the text colour of a run as ``{"r", "g", "b"}``, or None.

    The run's own colour is tried first; when it has none, the ``w:color``
    value in the ``w:rPr`` of the paragraph's style is used.

    Fixes over the earlier version:
      * ``color.rgb`` is unpacked as a three-component sequence instead of
        being bit-shifted, which raised for every run.
      * The style colour is read with an attribute XPath; the old membership
        test for the literal key ``'w:val'`` never matched.
      * Only an 8-digit value has its first two digits (alpha) dropped, so a
        6-digit colour that merely starts with ``FF`` is no longer truncated.
        Values that are not 6 hex digits afterwards are ignored.

    Args:
        run: Object exposing ``font.color.rgb`` (three ints, or None).
        paragraph: Object exposing ``style.element.xpath``.

    Returns:
        ``{"r": int, "g": int, "b": int}`` or None when no colour is found.
    """
    color_info = None

    # 1. Colour set directly on the run.
    try:
        color = run.font.color
        rgb = color.rgb if color is not None else None
        if rgb is not None:
            red, green, blue = rgb
            color_info = {"r": red, "g": green, "b": blue}
            print(f"从run.font获取颜色: RGB({color_info['r']}, {color_info['g']}, {color_info['b']})")
    except Exception as e:
        print(f"从run.font获取颜色失败: {str(e)}")

    # 2. Colour inherited from the paragraph style.
    if color_info is None:
        try:
            values = paragraph.style.element.xpath('w:rPr/w:color/@w:val')
            if values:
                hex_color = str(values[0]).strip()
                if len(hex_color) == 8:
                    hex_color = hex_color[2:]  # drop the leading alpha byte
                if len(hex_color) == 6:
                    rgb = tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
                    color_info = {
                        "r": rgb[0],
                        "g": rgb[1],
                        "b": rgb[2],
                    }
                    print(f"从段落样式获取颜色: RGB{rgb}")
        except Exception as e:
            print(f"从段落样式获取颜色失败: {str(e)}")

    return color_info
|
||||
|
||||
def get_cell_alignment(cell):
    """Return the horizontal alignment of a cell's first paragraph.

    Args:
        cell: Object exposing ``paragraphs``.

    Returns:
        The result of ``get_alignment_with_fallback`` for the first
        paragraph, or "left" when the cell has no paragraphs.
    """
    if not cell.paragraphs:
        return "left"
    return get_alignment_with_fallback(cell.paragraphs[0])
|
||||
|
||||
def get_cell_border(cell):
    """Return the border settings for the four sides of a cell.

    Each side found under ``w:tcBorders`` is described by its ``w:val``
    (style), ``w:sz`` (size, as int) and ``w:color``; missing attributes
    default to 'single', 4 and '000000'. A side without an explicit border,
    a cell without ``tcPr``, or any error all yield the default solid border.

    Args:
        cell: Object whose ``_element.tcPr`` is the cell-properties element
            (or None).

    Returns:
        A dict keyed by 'top', 'bottom', 'left', 'right'; each value is a
        dict with ``style``, ``size`` and ``color``.
    """
    sides = ('top', 'bottom', 'left', 'right')
    # Solid single-line border used whenever nothing explicit is available.
    default_border = {
        side: {"style": "single", "size": 4, "color": "000000"}
        for side in sides
    }

    try:
        tcPr = cell._element.tcPr
        if tcPr is None:
            return default_border

        borders = {}
        for side in sides:
            matches = tcPr.xpath(f'w:tcBorders/w:{side}')
            if not matches:
                borders[side] = default_border[side]
                continue
            edge = matches[0]
            edge_style = edge.get(qn('w:val'), 'single')
            edge_size = edge.get(qn('w:sz'), '4')
            edge_color = edge.get(qn('w:color'), '000000')
            borders[side] = {
                "style": edge_style,
                "size": int(edge_size),
                "color": edge_color,
            }
        return borders
    except Exception as e:
        print(f"获取单元格边框失败: {str(e)}, 使用默认边框")
        return default_border
|
||||
|
||||
def process_cell(cell):
    """Serialise every paragraph of a cell, logging the paragraph count.

    Args:
        cell: Object exposing ``paragraphs``.

    Returns:
        A list with one ``process_paragraph`` result per paragraph, in order.
    """
    print(f"处理单元格,包含 {len(cell.paragraphs)} 个段落")
    return [process_paragraph(para) for para in cell.paragraphs]
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: convert one hard-coded document and write the
    # result to output.json in the current working directory.
    docx_path = r'D:\work\报告扫描\source.docx'
    json_data = docx_to_json(docx_path)

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    serialised = json.dumps(json_data, ensure_ascii=False, indent=2)
    with open("output.json", "w", encoding="utf-8") as f:
        f.write(serialised)

    print("转换完成,结果已保存到output.json")
|
|
@ -1,243 +0,0 @@
|
|||
from docx import Document
|
||||
from docx.shared import Pt, RGBColor
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from docx.oxml.shared import qn, OxmlElement
|
||||
import json
|
||||
|
||||
def json_to_docx(json_data, output_path):
    """Rebuild a .docx document from the JSON element list and save it.

    Elements of type "text" are added with ``add_paragraph_from_json`` and
    elements of type "table" with ``add_table_from_json``; any other type is
    ignored.

    Args:
        json_data: Sequence of dicts with ``type`` and ``content`` keys.
        output_path: Destination passed to ``doc.save``.
    """
    print(f"\n开始转换JSON到DOCX文档,输出路径: {output_path}")
    doc = Document()
    total_elements = len(json_data)
    print(f"文档包含 {total_elements} 个元素(段落和表格)")

    for position, element in enumerate(json_data, start=1):
        print(f"\n处理元素 {position}/{total_elements}: ", end="")
        kind = element["type"]

        if kind == "text":
            content = element["content"]
            print(f"段落 (长度: {len(content['runs'])}个runs)")
            add_paragraph_from_json(doc, content)
            continue

        if kind == "table":
            content = element["content"]
            rows = content["rows"]
            cols = content["cols"]
            merges = len(content.get("merged_cells", []))
            print(f"表格 ({rows}行×{cols}列, 包含 {merges} 个合并单元格)")
            add_table_from_json(doc, content, element.get("bold", False))

    print("\n正在保存文档...")
    doc.save(output_path)
    print(f"文档已成功保存到 {output_path}")
|
||||
|
||||
def add_paragraph_from_json(doc, para_json):
    """Append a paragraph described by ``para_json`` to ``doc``.

    ``doc`` may be anything with an ``add_paragraph()`` method; in this file
    it is called with both a document and a table cell.

    Args:
        doc: Container exposing ``add_paragraph()``.
        para_json: Dict with ``alignment`` (str) and ``runs`` (list). Each
            run has ``text``, ``font`` (``name``, ``size``, ``bold``,
            ``italic``, ``underline``, ``color``) and optionally
            ``has_page_break``; a missing flag is treated as False.
    """
    # Imported once per call; the original re-ran ``import docx`` inside the
    # per-run loop. WD_BREAK is not among this file's top-level imports.
    from docx.enum.text import WD_BREAK

    paragraph = doc.add_paragraph()
    print(f" 添加段落 (对齐: {para_json['alignment']})")

    # Paragraph alignment; unknown names fall back to left.
    alignment_map = {
        "left": WD_ALIGN_PARAGRAPH.LEFT,
        "center": WD_ALIGN_PARAGRAPH.CENTER,
        "right": WD_ALIGN_PARAGRAPH.RIGHT,
        "justify": WD_ALIGN_PARAGRAPH.JUSTIFY,
    }
    paragraph.alignment = alignment_map.get(para_json["alignment"], WD_ALIGN_PARAGRAPH.LEFT)

    for run_idx, run_json in enumerate(para_json["runs"], 1):
        run = paragraph.add_run(run_json["text"])
        # Tolerate JSON that has no has_page_break key.
        if run_json.get("has_page_break", False):
            run.add_break(WD_BREAK.PAGE)
        font = run.font
        font_json = run_json["font"]

        print(f" 添加run {run_idx}: '{run_json['text']}' "
              f"(字体: {font_json['name']}, 大小: {font_json['size']}, "
              f"加粗: {font_json['bold']}, 斜体: {font_json['italic']})")

        if font_json["name"]:
            font.name = font_json["name"]
            # Also set the East Asian font slot to the same name.
            run.element.rPr.rFonts.set(qn('w:eastAsia'), font_json["name"])

        if font_json["size"]:
            font.size = Pt(font_json["size"])

        font.bold = font_json["bold"]
        font.italic = font_json["italic"]
        font.underline = font_json["underline"]

        if font_json["color"]:
            color = font_json["color"]
            font.color.rgb = RGBColor(color["r"], color["g"], color["b"])
            print(f" 设置颜色: RGB({color['r']}, {color['g']}, {color['b']})")
|
||||
|
||||
def add_table_from_json(doc, table_json, bold=False):
    """Append a table described by ``table_json`` to ``doc``.

    Steps: create the table with the 'Table Grid' style, apply column widths
    and row heights (stored in inches, written in twips), merge the recorded
    rectangles, then format every cell that is not a non-primary member of a
    merge.

    Args:
        doc: Object exposing ``add_table(rows=..., cols=...)``.
        table_json: Dict with ``rows``, ``cols``, ``cells`` and optionally
            ``col_widths``, ``row_heights``, ``merged_cells``.
        bold: Accepted for compatibility; not used by this function.
    """
    print(f" 创建表格: {table_json['rows']}行 × {table_json['cols']}列")
    table = doc.add_table(rows=table_json["rows"], cols=table_json["cols"])

    # Start from the built-in grid style; per-cell borders are applied later.
    table.style = 'Table Grid'

    # Column widths
    if "col_widths" in table_json and any(table_json["col_widths"]):
        print(" 设置列宽...")
        for col_idx, width in enumerate(table_json["col_widths"]):
            if width is None:
                continue
            twips_width = int(width * 1440)  # 1 inch = 1440 twips
            for cell in table.columns[col_idx].cells:
                tcPr = cell._tc.get_or_add_tcPr()
                tcW = tcPr.first_child_found_in("w:tcW")
                if tcW is None:
                    tcW = OxmlElement('w:tcW')
                    tcPr.append(tcW)
                tcW.set(qn('w:w'), str(twips_width))
                tcW.set(qn('w:type'), 'dxa')  # absolute units

    # Row heights
    if "row_heights" in table_json and any(table_json["row_heights"]):
        print(" 设置行高...")
        for row_idx, height in enumerate(table_json["row_heights"]):
            if height is None:
                continue
            twips_height = int(height * 1440)  # 1 inch = 1440 twips
            trPr = table.rows[row_idx]._tr.get_or_add_trPr()
            trHeight = OxmlElement('w:trHeight')
            trHeight.set(qn('w:val'), str(twips_height))
            trHeight.set(qn('w:hRule'), 'atLeast')  # 'exact' would fix the height
            trPr.append(trHeight)

    # Merged rectangles
    for merge_idx, merge_info in enumerate(table_json.get("merged_cells", []), 1):
        start_row, start_col = merge_info["start_row"], merge_info["start_col"]
        end_row, end_col = merge_info["end_row"], merge_info["end_col"]
        print(f" 合并单元格 #{merge_idx}: 从({start_row},{start_col})到({end_row},{end_col})")
        table.cell(start_row, start_col).merge(table.cell(end_row, end_col))

    # Cell content and formatting
    for row_idx, row_data in enumerate(table_json["cells"]):
        for col_idx, cell_data in enumerate(row_data):
            absorbed = cell_data["is_merged"] and not cell_data["merge_info"]["is_primary"]
            if absorbed:
                # Non-primary members of a merge have no content of their own.
                print(f" 跳过被合并的单元格({row_idx},{col_idx})")
                continue

            target = table.cell(cell_data["row"], cell_data["col"])
            print(f" 处理单元格({row_idx},{col_idx}) - 对齐: {cell_data['alignment']}")
            format_cell(target, cell_data)
|
||||
|
||||
def format_cell(cell, cell_data):
    """Replace a cell's content and apply its recorded formatting.

    Existing paragraphs are removed, the paragraphs in
    ``cell_data["content"]`` are added, then alignment, borders, shading and
    margins are applied.

    Args:
        cell: Table cell exposing ``paragraphs`` and ``add_paragraph()``.
        cell_data: Dict with ``content`` and ``border`` plus the keys read by
            ``set_cell_alignment``; ``shading`` and ``margins`` are optional.
    """
    # Drop whatever paragraphs the cell currently holds.
    for p in cell.paragraphs:
        p._element.getparent().remove(p._element)

    content = cell_data["content"]
    for para in content:
        add_paragraph_from_json(cell, para)

    # With empty content the loop above adds nothing, which would leave the
    # cell without any paragraph; keep one empty paragraph in that case.
    if not content:
        cell.add_paragraph()

    set_cell_alignment(cell, cell_data)
    set_cell_border(cell, cell_data["border"])

    if cell_data.get("shading"):
        set_cell_shading(cell, cell_data["shading"])

    if cell_data.get("margins"):
        set_cell_margins(cell, cell_data["margins"])
|
||||
|
||||
def set_cell_alignment(cell, cell_data):
    """Apply horizontal and vertical alignment to a cell.

    The horizontal alignment is set on the cell's first paragraph only. The
    vertical alignment is written as a new ``w:vAlign`` element appended to
    the cell's ``tcPr``; values other than top/center/bottom become 'top'.

    Args:
        cell: Table cell exposing ``paragraphs`` and ``_tc``.
        cell_data: Dict with ``alignment`` and optionally ``vertical_align``.
    """
    # Horizontal alignment
    paragraphs = cell.paragraphs
    if paragraphs:
        horizontal = {
            "left": WD_ALIGN_PARAGRAPH.LEFT,
            "center": WD_ALIGN_PARAGRAPH.CENTER,
            "right": WD_ALIGN_PARAGRAPH.RIGHT,
            "justify": WD_ALIGN_PARAGRAPH.JUSTIFY,
        }
        paragraphs[0].alignment = horizontal.get(cell_data["alignment"], WD_ALIGN_PARAGRAPH.LEFT)

    # Vertical alignment
    tcPr = cell._tc.get_or_add_tcPr()
    vAlign = OxmlElement('w:vAlign')
    requested = cell_data.get('vertical_align', 'top')
    # The requested value is logged before it is validated.
    print(f" 设置垂直对齐: {requested}")

    effective = requested if requested in ('top', 'center', 'bottom') else 'top'
    vAlign.set(qn('w:val'), effective)
    tcPr.append(vAlign)
|
||||
|
||||
def set_cell_shading(cell, shading):
    """Set a cell's background colour.

    Writes a ``w:shd`` element with ``w:val="clear"`` and the fill colour
    from ``shading["color"]``; ``w:themeColor`` is added when
    ``shading["theme"]`` is non-empty. An existing ``w:shd`` is removed
    first, so repeated calls do not pile up duplicate elements.

    Args:
        cell: Table cell exposing ``_tc``.
        shading: Dict with ``color`` and optionally ``theme``.
    """
    tcPr = cell._tc.get_or_add_tcPr()

    previous = tcPr.find(qn('w:shd'))
    if previous is not None:
        tcPr.remove(previous)

    shd = OxmlElement('w:shd')
    # w:val is a required attribute of w:shd in the WordprocessingML schema;
    # "clear" means no pattern, so only the fill colour shows.
    shd.set(qn('w:val'), 'clear')
    shd.set(qn('w:fill'), shading["color"])
    if shading.get("theme"):
        shd.set(qn('w:themeColor'), shading["theme"])
    tcPr.append(shd)
|
||||
|
||||
def set_cell_margins(cell, margins):
    """Set a cell's margins from a ``{side: {"w": ..., "type": ...}}`` dict.

    A side whose ``w`` is missing or None is skipped, and ``type`` is only
    written when present. The reader stores whatever the attribute lookup
    returned, which can be None, and passing None as an attribute value to
    ``set`` raises. Nothing is appended when no side is usable.

    Args:
        cell: Table cell exposing ``_tc``.
        margins: Mapping of side name to a dict with ``w`` and ``type``.
    """
    tcMar = OxmlElement('w:tcMar')

    for side, margin in margins.items():
        width = margin.get("w")
        if width is None:
            continue  # a margin without a width carries no information

        side_el = OxmlElement(f'w:{side}')
        side_el.set(qn('w:w'), str(width))
        margin_type = margin.get("type")
        if margin_type is not None:
            side_el.set(qn('w:type'), str(margin_type))
        tcMar.append(side_el)

    if len(tcMar):
        cell._tc.get_or_add_tcPr().append(tcMar)
|
||||
|
||||
def set_cell_border(cell, border_data):
    """Write per-side borders into a cell.

    The cell's ``w:tcBorders`` element is reused when present and created
    otherwise. For each of top/left/bottom/right present in ``border_data``
    a new side element is appended with ``w:val``, ``w:sz`` and ``w:color``
    (defaults: 'single', 4, '000000').

    Args:
        cell: Table cell exposing ``_tc``.
        border_data: Mapping of side name to a dict with optional ``style``,
            ``size`` and ``color``.
    """
    tcPr = cell._tc.get_or_add_tcPr()

    tcBorders = tcPr.first_child_found_in("w:tcBorders")
    if tcBorders is None:
        tcBorders = OxmlElement('w:tcBorders')
        tcPr.append(tcBorders)

    for side in ('top', 'left', 'bottom', 'right'):
        if side not in border_data:
            continue
        spec = border_data[side]
        edge = OxmlElement(f'w:{side}')
        edge.set(qn('w:val'), spec.get('style', 'single'))
        edge.set(qn('w:sz'), str(spec.get('size', 4)))
        edge.set(qn('w:color'), spec.get('color', '000000'))
        tcBorders.append(edge)
|
||||
|
||||
# 使用示例
|
||||
if __name__ == "__main__":
    # Script entry point: read the JSON produced earlier and rebuild a
    # document from it.
    input_json = "output.json"
    output_path = "restored.docx"

    print(f"从 {input_json} 读取JSON数据...")
    with open(input_json, "r", encoding="utf-8") as f:
        json_data = json.loads(f.read())

    # Convert the JSON back to DOCX.
    json_to_docx(json_data, output_path)
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
|
@ -1,41 +0,0 @@
|
|||
from docx import Document
|
||||
from docx.enum.text import WD_BREAK
|
||||
import docx.oxml.shared as oxml
|
||||
|
||||
def find_and_mark_page_breaks(input_path, output_path):
    """Detect page breaks in a document and mark them in place.

    For every run that ``has_page_break`` reports, form-feed characters in
    the run text are replaced by the visible marker ``[PAGE BREAK]`` and a
    page break is added to the run. The result is saved to ``output_path``.

    Args:
        input_path: Path of the document to read.
        output_path: Path the modified document is saved to.
    """
    doc = Document(input_path)

    for paragraph in doc.paragraphs:
        for run in paragraph.runs:
            if not has_page_break(run):
                continue

            print(f"发现分页符 - 段落内容: '{paragraph.text}'")

            # Make the break visible in the text.
            marked_text = run.text.replace("\x0c", "[PAGE BREAK]")
            run.text = marked_text

            # Add a page break after the rewritten text.
            run.add_break(WD_BREAK.PAGE)

    doc.save(output_path)
    print(f"处理完成,结果已保存到: {output_path}")
|
||||
|
||||
def has_page_break(run):
    """Tell whether a run contains a page break.

    True when the run's XML text contains both ``w:br`` and ``type="page"``,
    or when the run text contains a form-feed character.

    Args:
        run: Object exposing ``_element.xml`` (str) and ``text`` (str).

    Returns:
        True when a page break is detected, otherwise False.
    """
    xml = run._element.xml
    explicit_break = 'w:br' in xml and 'type="page"' in xml
    if explicit_break:
        return True
    return '\x0c' in run.text
|
||||
|
||||
# 使用示例
|
||||
# Example usage. NOTE: there is no __main__ guard here, so these statements
# also run when the module is imported.
input_file, output_file = "source.docx", "output_with_marks.docx"
find_and_mark_page_breaks(input_path=input_file, output_path=output_file)
|
Loading…
Reference in New Issue