Report_Generate_Server/utils/document_utils.py

174 lines
5.4 KiB
Python

"""
Document utility functions for Word Document Server.
"""
import json
from typing import Dict, List, Any
from docx import Document
def get_document_properties(doc_path: str) -> Dict[str, Any]:
    """Collect core metadata and simple size statistics for a Word file.

    Args:
        doc_path: Filesystem path of the document to inspect.

    Returns:
        A dict with the core-property fields (empty string, or 0 for
        ``revision``, when a field is unset) plus ``page_count``,
        ``word_count``, ``paragraph_count`` and ``table_count``.
        If the path does not exist or the document cannot be read, a dict
        holding a single ``"error"`` message is returned instead.
    """
    import os

    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    try:
        document = Document(doc_path)
        props = document.core_properties

        # Plain text fields: fall back to an empty string when unset.
        summary = {
            field: getattr(props, field) or ""
            for field in ("title", "author", "subject", "keywords")
        }

        # Timestamps are reported as strings, empty when missing.
        for stamp in ("created", "modified"):
            moment = getattr(props, stamp)
            summary[stamp] = str(moment) if moment else ""

        summary["last_modified_by"] = props.last_modified_by or ""
        summary["revision"] = props.revision or 0

        # NOTE: this is the number of sections, not of rendered pages.
        summary["page_count"] = len(document.sections)

        # Whitespace-separated tokens summed over document.paragraphs.
        paragraphs = document.paragraphs
        summary["word_count"] = sum(len(item.text.split()) for item in paragraphs)
        summary["paragraph_count"] = len(paragraphs)
        summary["table_count"] = len(document.tables)
        return summary
    except Exception as exc:
        return {"error": f"Failed to get document properties: {exc!s}"}
def extract_document_text(doc_path: str) -> str:
    """Return the text of a Word document as one newline-joined string.

    The text of every entry in ``doc.paragraphs`` comes first, followed by
    the text of every paragraph in every table cell (tables, rows and cells
    in document order).

    Args:
        doc_path: Filesystem path of the document to read.

    Returns:
        The joined text. When the path does not exist or reading fails, a
        human-readable message describing the problem is returned instead
        of raising.
    """
    import os

    if not os.path.exists(doc_path):
        return f"Document {doc_path} does not exist"

    def _table_lines(document):
        # Walk tables -> rows -> cells -> paragraphs, yielding each text.
        for tbl in document.tables:
            for row in tbl.rows:
                for cell in row.cells:
                    for cell_paragraph in cell.paragraphs:
                        yield cell_paragraph.text

    try:
        document = Document(doc_path)
        lines = [body_paragraph.text for body_paragraph in document.paragraphs]
        lines.extend(_table_lines(document))
        return "\n".join(lines)
    except Exception as exc:
        return f"Failed to extract text: {exc!s}"
def get_document_structure(doc_path: str) -> Dict[str, Any]:
    """Summarise the paragraphs and tables of a Word document.

    Args:
        doc_path: Filesystem path of the document to inspect.

    Returns:
        A dict with two lists:

        * ``"paragraphs"``: one entry per paragraph with its ``index``,
          its ``text`` cut to 100 characters (``"..."`` appended when cut)
          and its ``style`` name (``"Normal"`` when no style is set).
        * ``"tables"``: one entry per table with its ``index``, ``rows``
          and ``columns`` counts, and a ``preview`` of at most 3x3 cells,
          each cut to 20 characters.

        If the path does not exist or the document cannot be read, a dict
        holding a single ``"error"`` message is returned instead.
    """
    import os

    def _shorten(value, limit):
        # Keep at most `limit` characters, marking a cut with an ellipsis.
        if len(value) > limit:
            return value[:limit] + "..."
        return value

    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    try:
        document = Document(doc_path)

        paragraph_entries = [
            {
                "index": position,
                "text": _shorten(paragraph.text, 100),
                "style": paragraph.style.name if paragraph.style else "Normal",
            }
            for position, paragraph in enumerate(document.paragraphs)
        ]

        table_entries = []
        for position, tbl in enumerate(document.tables):
            row_total = len(tbl.rows)
            column_total = len(tbl.columns)

            # Sample only the top-left corner of the table.
            preview = []
            for r in range(min(3, row_total)):
                sampled_row = []
                for c in range(min(3, column_total)):
                    try:
                        sampled_row.append(_shorten(tbl.cell(r, c).text, 20))
                    except IndexError:
                        # The addressed cell is not available in this row.
                        sampled_row.append("N/A")
                preview.append(sampled_row)

            table_entries.append({
                "index": position,
                "rows": row_total,
                "columns": column_total,
                "preview": preview,
            })

        return {"paragraphs": paragraph_entries, "tables": table_entries}
    except Exception as exc:
        return {"error": f"Failed to get document structure: {exc!s}"}
def find_paragraph_by_text(doc, text, partial_match=False):
    """
    Locate paragraphs by their text.

    Args:
        doc: Document object exposing a ``paragraphs`` sequence
        text: Text to look for
        partial_match: When true, a paragraph matches if it contains ``text``;
            otherwise its text must equal ``text`` exactly

    Returns:
        List of indices (positions in ``doc.paragraphs``) of the matching
        paragraphs, in document order
    """
    def _matches(candidate):
        # Substring test in partial mode, strict equality otherwise.
        return text in candidate if partial_match else candidate == text

    return [
        position
        for position, paragraph in enumerate(doc.paragraphs)
        if _matches(paragraph.text)
    ]
def _replace_in_paragraph(paragraph, old_text, new_text):
    """Replace ``old_text`` inside each run of ``paragraph``; return the number of occurrences replaced."""
    # Cheap pre-check on the whole paragraph so that runs of paragraphs
    # without any match are never touched.
    if old_text not in paragraph.text:
        return 0

    replaced = 0
    for run in paragraph.runs:
        # str.count and str.replace both work on non-overlapping matches,
        # so the tally equals the number of substitutions performed.
        hits = run.text.count(old_text)
        if hits:
            run.text = run.text.replace(old_text, new_text)
            replaced += hits
    return replaced


def find_and_replace_text(doc, old_text, new_text):
    """
    Find and replace text throughout the document.

    Body paragraphs are processed first, then the paragraphs of every table
    cell. Replacement is done run by run, so an occurrence of ``old_text``
    that is split across two or more runs is not replaced.

    Args:
        doc: Document object
        old_text: Text to find. An empty string matches nothing and leaves
            the document untouched.
        new_text: Text to replace with

    Returns:
        Number of replacements made (every replaced occurrence is counted,
        including several within the same run)
    """
    # An empty search string is "in" every string and str.replace("", x)
    # would insert x between all characters, so treat it as a no-op.
    if not old_text:
        return 0

    count = 0

    # Search in paragraphs
    for para in doc.paragraphs:
        count += _replace_in_paragraph(para, old_text, new_text)

    # Search in tables
    # NOTE(review): python-docx may hand back the same merged cell more than
    # once via row.cells; a repeat visit is a no-op unless new_text itself
    # contains old_text - confirm whether that case matters for callers.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    count += _replace_in_paragraph(para, old_text, new_text)

    return count
def clear_header(section):
    """Call ``clear()`` on every paragraph of the given section's header.

    Args:
        section: Object exposing ``header.paragraphs``; each paragraph must
            provide a ``clear()`` method.
    """
    header_paragraphs = section.header.paragraphs
    for header_paragraph in header_paragraphs:
        header_paragraph.clear()