import os
import zipfile
import shutil
import logging
from tempfile import mkdtemp
from copy import deepcopy

from lxml import etree

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('docx_merge_advanced.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# WordprocessingML namespace, in Clark notation for namespaced attribute lookups
W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
W = '{%s}' % W_NS


class AdvancedDocxMerger:
    def __init__(self, doc1_path, doc2_path, output_path):
        self.doc1_path = doc1_path
        self.doc2_path = doc2_path
        self.output_path = output_path
        self.temp_dir = mkdtemp(prefix='docx_merge_')
        self.doc1_dir = os.path.join(self.temp_dir, "doc1")
        self.doc2_dir = os.path.join(self.temp_dir, "doc2")
        self.merged_dir = os.path.join(self.temp_dir, "merged")

        # XML namespaces
        self.ns = {
            'w': W_NS,
            'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
            'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
        }

        # XML parser configuration
        self.parser = etree.XMLParser(remove_blank_text=True)

        logger.info(f"Merger initialized, temporary directory: {self.temp_dir}")

    def _extract_docx(self):
        """Extract both Word documents into the temporary directory."""
        logger.info("Extracting documents...")
        try:
            os.makedirs(self.doc1_dir, exist_ok=True)
            os.makedirs(self.doc2_dir, exist_ok=True)

            with zipfile.ZipFile(self.doc1_path, 'r') as zip_ref:
                zip_ref.extractall(self.doc1_dir)
            logger.info(f"Extracted {self.doc1_path}")

            with zipfile.ZipFile(self.doc2_path, 'r') as zip_ref:
                zip_ref.extractall(self.doc2_dir)
            logger.info(f"Extracted {self.doc2_path}")

            return True
        except Exception as e:
            logger.error(f"Extraction failed: {str(e)}")
            return False

    def _prepare_merged_dir(self):
        """Prepare the merge directory, seeded with a full copy of doc1."""
        logger.info("Preparing merge directory...")
        try:
            # Copy doc1 in full as the base document
            shutil.copytree(self.doc1_dir, self.merged_dir)

            # Record every file that already exists
            self.existing_files = set()
            for root, _, files in os.walk(self.merged_dir):
                for file in files:
                    rel_path = os.path.relpath(os.path.join(root, file), self.merged_dir)
                    self.existing_files.add(rel_path.replace("\\", "/"))

            logger.info(f"Merge directory ready with {len(self.existing_files)} existing files")
            return True
        except Exception as e:
            logger.error(f"Failed to prepare merge directory: {str(e)}")
            return False

    def _is_xml_file(self, filename):
        """Return True if the file is an XML part."""
        return filename.endswith('.xml') or filename.endswith('.rels')

    def _merge_styles(self, root1, root2):
        """Merge style definitions (styles.xml)."""
        try:
            # Collect existing style IDs (w:styleId is a namespaced attribute)
            existing_style_ids = {elem.get(W + 'styleId')
                                  for elem in root1
                                  if elem.get(W + 'styleId')}

            # Append styles from doc2 that doc1 does not define yet
            added = 0
            for style in root2:
                style_id = style.get(W + 'styleId')
                if style_id and style_id not in existing_style_ids:
                    root1.append(deepcopy(style))
                    existing_style_ids.add(style_id)
                    added += 1

            logger.debug(f"Added {added} new styles")
            return True
        except Exception as e:
            logger.error(f"Failed to merge styles: {str(e)}")
            return False

    def _merge_numbering(self, root1, root2):
        """Merge numbering definitions (numbering.xml)."""
        try:
            # Collect existing numbering IDs (w:numId is a namespaced attribute)
            existing_num_ids = {num.get(W + 'numId')
                                for num in root1.xpath('//w:num', namespaces=self.ns)}

            # Append w:num elements from doc2 with unseen IDs.
            # NOTE: the w:abstractNum definitions they reference are not copied here.
            added = 0
            for num in root2.xpath('//w:num', namespaces=self.ns):
                num_id = num.get(W + 'numId')
                if num_id and num_id not in existing_num_ids:
                    root1.append(deepcopy(num))
                    existing_num_ids.add(num_id)
                    added += 1

            logger.debug(f"Added {added} new numbering definitions")
            return True
        except Exception as e:
            logger.error(f"Failed to merge numbering: {str(e)}")
            return False
    def _merge_notes(self, root1, root2, note_type="footnote"):
        """Merge footnotes or endnotes (footnotes.xml / endnotes.xml)."""
        try:
            # Determine the highest existing note ID (w:id is a namespaced attribute)
            existing_notes = root1.xpath(f'//w:{note_type}', namespaces=self.ns)
            max_id = max((int(note.get(W + 'id', 0)) for note in existing_notes), default=0)

            # Append notes from doc2 under fresh IDs.
            # NOTE: the matching reference IDs in doc2's document.xml are not
            # renumbered here, so they may still point at their original IDs.
            added = 0
            for note in root2.xpath(f'//w:{note_type}', namespaces=self.ns):
                # Skip separator/continuation placeholder notes; doc1 already has them
                if note.get(W + 'type') in ('separator', 'continuationSeparator'):
                    continue
                max_id += 1
                new_note = deepcopy(note)
                new_note.set(W + 'id', str(max_id))
                root1.append(new_note)
                added += 1

            logger.debug(f"Added {added} new {note_type}s")
            return True
        except Exception as e:
            logger.error(f"Failed to merge {note_type}s: {str(e)}")
            return False

    def _merge_header_footer(self, root1, root2):
        """Merge header/footer content (placeholder: doc1's headers and footers are kept as-is)."""
        try:
            # # Simply append all content
            # for elem in root2:
            #     root1.append(deepcopy(elem))
            return True
        except Exception as e:
            logger.error(f"Failed to merge headers/footers: {str(e)}")
            return False

    @staticmethod
    def _is_on(elem):
        """Interpret an OOXML on/off element: a missing w:val attribute means 'on'."""
        val = elem.get(W + 'val')
        return val is None or val in ('1', 'true', 'on')

    def _merge_settings(self, root1, root2):
        """Merge document settings (settings.xml), combining both documents' configuration."""
        try:
            # Settings to merge and the strategy for each (keys are local element names)
            merge_strategies = {
                # Page/layout settings
                'defaultTabStop': 'max',           # take the larger tab stop width
                'autoHyphenation': 'or',           # enabled if either document enables it
                'consecutiveHyphenLimit': 'max',   # take the larger hyphenation limit

                # Compatibility settings
                'compat': 'merge',                 # merge compatibility options
                'useFELayout': 'or',               # enabled if either document enables it

                # Revision tracking
                'trackRevisions': 'or',            # enabled if either document enables it
                'doNotTrackMoves': 'and',          # enabled only if both documents enable it

                # Other important settings
                'zoom': 'doc1',                    # keep the first document's zoom setting
                'mirrorMargins': 'or',             # enabled if either document enables it
            }

            # Record the changes that were applied
            changes = []

            # Process each setting element from doc2
            for setting in root2:
                # Local element name without the namespace prefix
                tag = setting.tag.split('}')[1] if '}' in setting.tag else setting.tag

                # Skip settings we have no strategy for
                if tag not in merge_strategies:
                    continue

                strategy = merge_strategies[tag]
                existing = root1.xpath(f'//w:{tag}', namespaces=self.ns)

                if not existing:
                    # doc1 lacks this setting entirely: copy it over
                    root1.append(deepcopy(setting))
                    changes.append(f"added {tag} = {setting.get(W + 'val', '')}")
                else:
                    # Merge according to the strategy ('doc1' keeps the existing value)
                    existing_setting = existing[0]

                    if strategy == 'max':
                        val1 = float(existing_setting.get(W + 'val', 0))
                        val2 = float(setting.get(W + 'val', 0))
                        if val2 > val1:
                            existing_setting.set(W + 'val', setting.get(W + 'val'))
                            changes.append(f"raised {tag}: {val1:g} → {val2:g}")

                    elif strategy == 'or':
                        if self._is_on(setting) and not self._is_on(existing_setting):
                            existing_setting.set(W + 'val', '1')
                            changes.append(f"enabled {tag}")

                    elif strategy == 'and':
                        if not self._is_on(setting) and self._is_on(existing_setting):
                            existing_setting.set(W + 'val', '0')
                            changes.append(f"disabled {tag} (disabled in doc2)")

                    elif strategy == 'merge' and tag == 'compat':
                        # Merge compatibility options that doc1 does not have yet
                        for child in setting:
                            child_tag = child.tag.split('}')[1]
                            if not root1.xpath(f'//w:compat/w:{child_tag}', namespaces=self.ns):
                                existing_setting.append(deepcopy(child))
                                changes.append(f"added compatibility option {child_tag}")

            # Special case: document protection
            doc_protection1 = root1.xpath('//w:documentProtection', namespaces=self.ns)
            doc_protection2 = root2.xpath('//w:documentProtection', namespaces=self.ns)
            if doc_protection2 and not doc_protection1:
                root1.append(deepcopy(doc_protection2[0]))
                changes.append("added document protection settings")

            # Special case: spelling and grammar proofing state
            proof_state1 = root1.xpath('//w:proofState', namespaces=self.ns)
            proof_state2 = root2.xpath('//w:proofState', namespaces=self.ns)
            if proof_state2:
                if proof_state1:
                    # Merge the proofing state attributes (w:spelling, w:grammar)
                    for attr in ('spelling', 'grammar'):
                        if (proof_state2[0].get(W + attr) == 'clean'
                                and proof_state1[0].get(W + attr) != 'clean'):
                            proof_state1[0].set(W + attr, 'clean')
                            changes.append(f"marked {attr} proofing state as clean")
                else:
                    root1.append(deepcopy(proof_state2[0]))
                    changes.append("added proofing state settings")

            if changes:
                logger.info(f"Merged document settings with {len(changes)} changes:\n - " + "\n - ".join(changes))
            else:
                logger.info("No settings changes needed; keeping the first document's settings")

            return True
        except Exception as e:
            logger.error(f"Failed to merge settings: {str(e)}")
            return False
    def _merge_relationships(self, root1, root2):
        """Merge a relationships (.rels) part."""
        try:
            # Collect every existing relationship ID
            existing_ids = {rel.get('Id') for rel in root1}

            # Find the highest numeric rId already in use
            max_id = 0
            for rel in root1:
                if rel.get('Id', '').startswith('rId'):
                    try:
                        max_id = max(max_id, int(rel.get('Id')[3:]))
                    except ValueError:
                        pass

            # Merge relationships from doc2
            added = 0
            for rel in root2:
                rel_id = rel.get('Id')
                new_rel = deepcopy(rel)
                if rel_id in existing_ids:
                    # Conflicting ID: assign a fresh rId.
                    # NOTE: the matching r:id references inside doc2's parts are not
                    # remapped here, so they will resolve to doc1's relationship.
                    max_id += 1
                    new_rel.set('Id', f"rId{max_id}")
                existing_ids.add(new_rel.get('Id'))
                root1.append(new_rel)
                added += 1

            logger.debug(f"Added {added} new relationships")
            return True
        except Exception as e:
            logger.error(f"Failed to merge relationships: {str(e)}")
            return False

    def _merge_xml_files(self, file1, file2, output_file):
        """Merge two XML parts and write the result to output_file."""
        try:
            if not os.path.exists(file2):
                return True

            tree1 = etree.parse(file1, self.parser)
            tree2 = etree.parse(file2, self.parser)
            root1 = tree1.getroot()
            root2 = tree2.getroot()

            filename = os.path.basename(output_file)

            # Dispatch on the type of part being merged
            if filename == 'settings.xml':
                self._merge_settings(root1, root2)
            elif '_rels' in output_file and output_file.endswith('.rels'):
                self._merge_relationships(root1, root2)
            elif filename == 'document.xml':
                self._merge_document_content(root1, root2)
            elif filename == 'styles.xml':
                self._merge_styles(root1, root2)
            elif filename == 'footnotes.xml':
                self._merge_notes(root1, root2, "footnote")
            elif filename == 'endnotes.xml':
                self._merge_notes(root1, root2, "endnote")
            elif filename == 'numbering.xml':
                self._merge_numbering(root1, root2)
            elif 'header' in filename or 'footer' in filename:
                self._merge_header_footer(root1, root2)
            elif filename == '[Content_Types].xml':
                # Only add content-type declarations that are not already present,
                # keyed by Extension (Default) or PartName (Override)
                existing_keys = {(c.tag, c.get('Extension') or c.get('PartName')) for c in root1}
                for child in root2:
                    key = (child.tag, child.get('Extension') or child.get('PartName'))
                    if key not in existing_keys:
                        root1.append(deepcopy(child))
                        existing_keys.add(key)
            else:
                # Default strategy: append all child elements from doc2
                for child in root2:
                    root1.append(deepcopy(child))

            # Write the merged XML back to the part
            tree1.write(output_file, encoding='UTF-8', xml_declaration=True)
            logger.debug(f"Merged XML part: {output_file}")
            return True
        except Exception as e:
            logger.error(f"Failed to merge XML part {output_file}: {str(e)}")
            return False

    def _merge_document_content(self, root1, root2):
        """Merge the document bodies (document.xml)."""
        try:
            body1 = root1.xpath("//w:body", namespaces=self.ns)
            body2 = root2.xpath("//w:body", namespaces=self.ns)

            if not body1 or not body2:
                logger.warning("A document is missing its w:body element")
                return

            body1 = body1[0]
            body2 = body2[0]

            # Preserve doc1's section formatting: move its trailing body-level
            # sectPr into the pPr of a closing paragraph before appending doc2
            sect_prs = body1.xpath("./w:sectPr", namespaces=self.ns)
            if sect_prs:
                new_p = etree.SubElement(body1, W + 'p')
                new_ppr = etree.SubElement(new_p, W + 'pPr')
                new_ppr.append(sect_prs[-1])  # append() moves the existing element

            # Append doc2's content (its trailing sectPr becomes the body-level one)
            for elem in body2:
                body1.append(deepcopy(elem))
        except Exception as e:
            logger.error(f"Failed to merge document content: {str(e)}")
            raise

    def _deep_merge_docx(self):
        """Deep-merge every file from doc2 into the merged directory."""
        logger.info("Starting deep merge...")

        # Walk all of doc2's files
        for root, _, files in os.walk(self.doc2_dir):
            for file in files:
                src_file = os.path.join(root, file)
                rel_path = os.path.relpath(src_file, self.doc2_dir)
                dest_file = os.path.join(self.merged_dir, rel_path)

                # Normalize the path for comparison
                norm_rel_path = rel_path.replace("\\", "/")

                if norm_rel_path not in self.existing_files:
                    # New file: copy it over as-is
                    os.makedirs(os.path.dirname(dest_file), exist_ok=True)
                    shutil.copy2(src_file, dest_file)
                    logger.debug(f"Copied new file: {norm_rel_path}")
                elif self._is_xml_file(file):
                    # Existing XML part: merge it
                    existing_file = os.path.join(self.merged_dir, rel_path)
                    if os.path.exists(existing_file):
                        if not self._merge_xml_files(existing_file, src_file, dest_file):
                            logger.warning(f"Merge failed, keeping original part: {norm_rel_path}")
                else:
                    # Existing non-XML file: keep doc1's copy
                    logger.debug(f"File already exists, skipping: {norm_rel_path}")

        logger.info("Deep merge complete")
        return True
    def _repack_docx(self):
        """Repackage the merged directory into a new .docx file."""
        logger.info("Repackaging document...")
        try:
            with zipfile.ZipFile(self.output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                for root, _, files in os.walk(self.merged_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arcname = os.path.relpath(file_path, self.merged_dir)
                        zipf.write(file_path, arcname)

            logger.info(f"Created merged document: {self.output_path}")
            return True
        except Exception as e:
            logger.error(f"Repackaging failed: {str(e)}")
            return False

    def _cleanup(self):
        """Remove the temporary working directory."""
        try:
            shutil.rmtree(self.temp_dir)
            logger.info("Temporary files removed")
        except Exception as e:
            logger.warning(f"Failed to remove temporary files: {str(e)}")

    def merge(self):
        """Run the full merge pipeline."""
        logger.info(f"Merging documents: {self.doc1_path} + {self.doc2_path} -> {self.output_path}")

        if not self._extract_docx():
            return False
        if not self._prepare_merged_dir():
            return False
        if not self._deep_merge_docx():
            return False
        if not self._repack_docx():
            return False

        self._cleanup()
        logger.info("Document merge completed successfully!")
        return True


if __name__ == "__main__":
    # Example usage
    merger = AdvancedDocxMerger(
        doc1_path="jingfeng_fengmian1.docx",
        doc2_path="quexian.docx",
        output_path="merged_document.docx"
    )

    if merger.merge():
        print("Merge succeeded!")
    else:
        print("Errors occurred during the merge; see the log file for details.")
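

# Optional sketch, assuming only the AdvancedDocxMerger class above: merging more
# than two documents by chaining pairwise merges through temporary files. The
# merge_many name and the intermediate file names are illustrative, not an
# existing API of this script.
def merge_many(doc_paths, output_path):
    """Merge a list of .docx files left to right via repeated pairwise merges."""
    if len(doc_paths) < 2:
        raise ValueError("Need at least two documents to merge")

    work_dir = mkdtemp(prefix='docx_merge_chain_')
    current = doc_paths[0]
    try:
        for i, next_doc in enumerate(doc_paths[1:], start=1):
            # The final step writes straight to the requested output path
            step_output = output_path if i == len(doc_paths) - 1 else os.path.join(work_dir, f"step_{i}.docx")
            if not AdvancedDocxMerger(current, next_doc, step_output).merge():
                return False
            current = step_output
        return True
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)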