""" Document utility functions for Word Document Server. """ import json from typing import Dict, List, Any from docx import Document def get_document_properties(doc_path: str) -> Dict[str, Any]: """Get properties of a Word document.""" import os if not os.path.exists(doc_path): return {"error": f"Document {doc_path} does not exist"} try: doc = Document(doc_path) core_props = doc.core_properties return { "title": core_props.title or "", "author": core_props.author or "", "subject": core_props.subject or "", "keywords": core_props.keywords or "", "created": str(core_props.created) if core_props.created else "", "modified": str(core_props.modified) if core_props.modified else "", "last_modified_by": core_props.last_modified_by or "", "revision": core_props.revision or 0, "page_count": len(doc.sections), "word_count": sum(len(paragraph.text.split()) for paragraph in doc.paragraphs), "paragraph_count": len(doc.paragraphs), "table_count": len(doc.tables) } except Exception as e: return {"error": f"Failed to get document properties: {str(e)}"} def extract_document_text(doc_path: str) -> str: """Extract all text from a Word document.""" import os if not os.path.exists(doc_path): return f"Document {doc_path} does not exist" try: doc = Document(doc_path) text = [] for paragraph in doc.paragraphs: text.append(paragraph.text) for table in doc.tables: for row in table.rows: for cell in row.cells: for paragraph in cell.paragraphs: text.append(paragraph.text) return "\n".join(text) except Exception as e: return f"Failed to extract text: {str(e)}" def get_document_structure(doc_path: str) -> Dict[str, Any]: """Get the structure of a Word document.""" import os if not os.path.exists(doc_path): return {"error": f"Document {doc_path} does not exist"} try: doc = Document(doc_path) structure = { "paragraphs": [], "tables": [] } # Get paragraphs for i, para in enumerate(doc.paragraphs): structure["paragraphs"].append({ "index": i, "text": para.text[:100] + ("..." if len(para.text) > 100 else ""), "style": para.style.name if para.style else "Normal" }) # Get tables for i, table in enumerate(doc.tables): table_data = { "index": i, "rows": len(table.rows), "columns": len(table.columns), "preview": [] } # Get sample of table data max_rows = min(3, len(table.rows)) for row_idx in range(max_rows): row_data = [] max_cols = min(3, len(table.columns)) for col_idx in range(max_cols): try: cell_text = table.cell(row_idx, col_idx).text row_data.append(cell_text[:20] + ("..." if len(cell_text) > 20 else "")) except IndexError: row_data.append("N/A") table_data["preview"].append(row_data) structure["tables"].append(table_data) return structure except Exception as e: return {"error": f"Failed to get document structure: {str(e)}"} def find_paragraph_by_text(doc, text, partial_match=False): """ Find paragraphs containing specific text. Args: doc: Document object text: Text to search for partial_match: If True, matches paragraphs containing the text; if False, matches exact text Returns: List of paragraph indices that match the criteria """ matching_paragraphs = [] for i, para in enumerate(doc.paragraphs): if partial_match and text in para.text: matching_paragraphs.append(i) elif not partial_match and para.text == text: matching_paragraphs.append(i) return matching_paragraphs def find_and_replace_text(doc, old_text, new_text): """ Find and replace text throughout the document. Args: doc: Document object old_text: Text to find new_text: Text to replace with Returns: Number of replacements made """ count = 0 # Search in paragraphs for para in doc.paragraphs: if old_text in para.text: for run in para.runs: if old_text in run.text: run.text = run.text.replace(old_text, new_text) count += 1 # Search in tables for table in doc.tables: for row in table.rows: for cell in row.cells: for para in cell.paragraphs: if old_text in para.text: for run in para.runs: if old_text in run.text: run.text = run.text.replace(old_text, new_text) count += 1 return count def clear_header(section): for para in section.header.paragraphs: para.clear()