168 lines
5.3 KiB
Python
168 lines
5.3 KiB
Python
"""
Document utility functions for Word Document Server.
"""
|
|
import json
|
|
from typing import Dict, List, Any
|
|
from docx import Document
|
|
|
|
|
|
def get_document_properties(doc_path: str) -> Dict[str, Any]:
    """Collect core metadata and simple size statistics for a Word document.

    Args:
        doc_path: Filesystem path of the document to inspect.

    Returns:
        A dict with the keys ``title``, ``author``, ``subject``,
        ``keywords``, ``created``, ``modified``, ``last_modified_by``,
        ``revision``, ``page_count``, ``word_count``, ``paragraph_count``
        and ``table_count``. When the path does not exist, or anything
        fails while reading the document, a dict holding a single
        ``error`` message is returned instead.
    """
    import os

    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    try:
        document = Document(doc_path)
        props = document.core_properties

        info: Dict[str, Any] = {}

        # Plain text properties fall back to an empty string when unset.
        for field in ("title", "author", "subject", "keywords"):
            info[field] = getattr(props, field) or ""

        # Timestamps are reported as strings, or "" when absent.
        created_at = props.created
        info["created"] = str(created_at) if created_at else ""
        modified_at = props.modified
        info["modified"] = str(modified_at) if modified_at else ""

        info["last_modified_by"] = props.last_modified_by or ""
        info["revision"] = props.revision or 0

        # NOTE: the value stored under "page_count" is the number of
        # sections in the document.
        info["page_count"] = len(document.sections)

        # Words are counted by whitespace-splitting each body paragraph.
        body_paragraphs = document.paragraphs
        info["word_count"] = sum(len(p.text.split()) for p in body_paragraphs)
        info["paragraph_count"] = len(body_paragraphs)
        info["table_count"] = len(document.tables)

        return info
    except Exception as exc:
        return {"error": f"Failed to get document properties: {str(exc)}"}
|
|
|
|
|
|
def extract_document_text(doc_path: str) -> str:
    """Return the text of a Word document as one newline-joined string.

    The text of ``doc.paragraphs`` comes first, followed by the text of
    every paragraph in every cell of every table in ``doc.tables``,
    visited table by table and row by row.

    Args:
        doc_path: Filesystem path of the document to read.

    Returns:
        The collected text joined with ``"\\n"``. When the path does not
        exist, or reading fails, a message describing the problem is
        returned as the string instead.
    """
    import os

    if not os.path.exists(doc_path):
        return f"Document {doc_path} does not exist"

    try:
        document = Document(doc_path)

        # Body paragraphs first ...
        pieces = [body_para.text for body_para in document.paragraphs]

        # ... then every paragraph found inside table cells.
        pieces.extend(
            cell_para.text
            for tbl in document.tables
            for tbl_row in tbl.rows
            for tbl_cell in tbl_row.cells
            for cell_para in tbl_cell.paragraphs
        )

        return "\n".join(pieces)
    except Exception as exc:
        return f"Failed to extract text: {str(exc)}"
|
|
|
|
|
|
def get_document_structure(doc_path: str) -> Dict[str, Any]:
    """Summarise the paragraphs and tables of a Word document.

    Args:
        doc_path: Filesystem path of the document to inspect.

    Returns:
        A dict with two lists:

        * ``paragraphs`` - one entry per paragraph with its ``index``, its
          ``text`` cut to 100 characters (``"..."`` appended when cut) and
          its ``style`` name (``"Normal"`` when the paragraph has no style).
        * ``tables`` - one entry per table with its ``index``, ``rows`` and
          ``columns`` counts, and a ``preview`` of up to 3x3 cells, each cut
          to 20 characters; a cell that cannot be looked up is ``"N/A"``.

        When the path does not exist, or reading fails, a dict holding a
        single ``error`` message is returned instead.
    """
    import os

    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    def clip(value, limit):
        # Keep at most `limit` characters, flagging any cut with "...".
        return value[:limit] + ("..." if len(value) > limit else "")

    try:
        document = Document(doc_path)

        paragraph_entries = [
            {
                "index": position,
                "text": clip(paragraph.text, 100),
                "style": paragraph.style.name if paragraph.style else "Normal",
            }
            for position, paragraph in enumerate(document.paragraphs)
        ]

        table_entries = []
        for position, table in enumerate(document.tables):
            row_total = len(table.rows)
            col_total = len(table.columns)

            # Sample at most the top-left 3x3 corner of the table.
            preview = []
            for r in range(min(3, row_total)):
                sample = []
                for c in range(min(3, col_total)):
                    try:
                        sample.append(clip(table.cell(r, c).text, 20))
                    except IndexError:
                        sample.append("N/A")
                preview.append(sample)

            table_entries.append({
                "index": position,
                "rows": row_total,
                "columns": col_total,
                "preview": preview,
            })

        return {"paragraphs": paragraph_entries, "tables": table_entries}
    except Exception as exc:
        return {"error": f"Failed to get document structure: {str(exc)}"}
|
|
|
|
|
|
def find_paragraph_by_text(doc, text, partial_match=False):
    """
    Locate paragraphs by their text.

    Args:
        doc: Document object
        text: Text to look for
        partial_match: If True, a paragraph matches when it contains the
            text; if False, its text must equal the text exactly

    Returns:
        List of indices (positions in ``doc.paragraphs``) of the matching
        paragraphs, in document order
    """
    # Choose the comparison once instead of re-testing the flag per paragraph.
    if partial_match:
        def is_hit(candidate):
            return text in candidate
    else:
        def is_hit(candidate):
            return candidate == text

    return [
        position
        for position, paragraph in enumerate(doc.paragraphs)
        if is_hit(paragraph.text)
    ]
|
|
|
|
|
|
def _replace_in_paragraphs(paragraphs, old_text, new_text):
    """Replace ``old_text`` inside each run of ``paragraphs``; return the number of occurrences replaced."""
    replaced = 0
    for para in paragraphs:
        # Paragraph-level check skips paragraphs whose runs cannot match.
        if old_text not in para.text:
            continue
        for run in para.runs:
            hits = run.text.count(old_text)
            if hits:
                run.text = run.text.replace(old_text, new_text)
                replaced += hits
    return replaced


def find_and_replace_text(doc, old_text, new_text):
    """
    Find and replace text throughout the document.

    Body paragraphs are processed first, then the paragraphs of every cell
    of every table in ``doc.tables``. Replacement happens run by run, so
    the formatting of each run is kept; as a consequence, an occurrence of
    ``old_text`` that is split across several runs is not replaced.

    Args:
        doc: Document object
        old_text: Text to find. An empty (or None) value is treated as
            "nothing to find" and leaves the document untouched.
        new_text: Text to replace with

    Returns:
        Number of replacements made (every replaced occurrence is counted,
        including several occurrences inside the same run)
    """
    # An empty needle is contained in every string, and str.replace("", x)
    # would insert x between all characters - never what a caller wants.
    if not old_text:
        return 0

    # Search in paragraphs
    count = _replace_in_paragraphs(doc.paragraphs, old_text, new_text)

    # Search in tables. A merged cell is reported once per grid position it
    # covers, so remember which cells were already handled; otherwise the
    # same text would be replaced again whenever new_text contains old_text.
    # Cells are identified by their underlying element when available; the
    # element is stored as the value so its id() cannot be recycled.
    handled = {}
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                element = getattr(cell, "_tc", cell)
                if id(element) in handled:
                    continue
                handled[id(element)] = element
                count += _replace_in_paragraphs(cell.paragraphs, old_text, new_text)

    return count
|