Report_Generate_Server/utils/extended_document_utils.py

166 lines
6.8 KiB
Python

"""
Extended document utilities for Word Document Server.
"""
from typing import Dict, List, Any, Tuple
from docx import Document
def get_paragraph_text(doc_path: str, paragraph_index: int) -> Dict[str, Any]:
    """
    Get text from a specific paragraph in a Word document.

    Args:
        doc_path: Path to the Word document
        paragraph_index: Index of the paragraph to extract (0-based)

    Returns:
        On success, a dictionary with the keys ``index``, ``text``, ``style``
        and ``is_heading``. On failure (missing file, index out of range, or
        any exception raised while reading the document), a dictionary with a
        single ``error`` key describing the problem. No exception is raised
        for those failures.
    """
    import os

    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    try:
        doc = Document(doc_path)
        # Read the paragraph list once and reuse it for the bounds check,
        # the error message and the lookup.
        paragraphs = doc.paragraphs

        # Negative indexes are rejected rather than counted from the end.
        if paragraph_index < 0 or paragraph_index >= len(paragraphs):
            return {"error": f"Invalid paragraph index: {paragraph_index}. Document has {len(paragraphs)} paragraphs."}

        paragraph = paragraphs[paragraph_index]

        # Resolve the style name once. A style without a name is reported as
        # "Normal" (same fallback as a paragraph without a style) instead of
        # letting ``None.startswith`` turn a readable paragraph into an error.
        style = paragraph.style
        style_name = style.name if style else None
        if style_name is None:
            style_name = "Normal"

        return {
            "index": paragraph_index,
            "text": paragraph.text,
            "style": style_name,
            "is_heading": style_name.startswith("Heading"),
        }
    except Exception as e:
        # Any failure while opening or reading the document is reported to
        # the caller as an error dictionary.
        return {"error": f"Failed to get paragraph text: {str(e)}"}
def _strip_edge_punctuation(token: str) -> str:
    """Return ``token`` without leading and trailing non-alphanumeric characters."""
    start, end = 0, len(token)
    while start < end and not token[start].isalnum():
        start += 1
    while end > start and not token[end - 1].isalnum():
        end -= 1
    return token[start:end]


def _find_positions(text: str, query: str, whole_word: bool) -> List[int]:
    """
    Return every position at which ``query`` occurs in ``text``.

    Both strings must already be case-normalised by the caller.

    * ``whole_word=True``: positions are indexes into ``text.split()``. A
      token matches when it equals ``query`` verbatim, or when it equals
      ``query`` after its surrounding punctuation is removed (so "report,"
      matches "report").
    * ``whole_word=False``: positions are character offsets of the
      non-overlapping occurrences of ``query``, scanning left to right.
    """
    if whole_word:
        return [
            word_idx
            for word_idx, token in enumerate(text.split())
            if token == query or _strip_edge_punctuation(token) == query
        ]

    positions: List[int] = []
    start_pos = 0
    while True:
        pos = text.find(query, start_pos)
        if pos == -1:
            return positions
        positions.append(pos)
        # Continue after this hit so occurrences never overlap.
        start_pos = pos + len(query)


def _record_matches(results: Dict[str, Any], para_text: str, query: str,
                    match_case: bool, whole_word: bool,
                    where: Dict[str, Any]) -> None:
    """Append one occurrence entry to ``results`` per match of ``query`` in ``para_text``."""
    haystack = para_text if match_case else para_text.lower()
    positions = _find_positions(haystack, query, whole_word)
    if not positions:
        return

    # The context always shows the original (not lower-cased) text, capped at
    # 100 characters.
    context = para_text[:100] + ("..." if len(para_text) > 100 else "")
    for position in positions:
        results["occurrences"].append({**where, "position": position, "context": context})
    results["total_count"] += len(positions)


def find_text(doc_path: str, text_to_find: str, match_case: bool = True, whole_word: bool = False) -> Dict[str, Any]:
    """
    Find all occurrences of specific text in a Word document.

    Body paragraphs are searched first, then every paragraph of every table
    cell.

    Args:
        doc_path: Path to the Word document
        text_to_find: Text to search for (must not be empty)
        match_case: Whether to perform case-sensitive search
        whole_word: Whether to match whole words only. Words are the
            whitespace-separated tokens of a paragraph; punctuation attached
            to a token is ignored when comparing.

    Returns:
        On success, a dictionary with the keys ``query``, ``match_case``,
        ``whole_word``, ``occurrences`` and ``total_count``. Each occurrence
        holds ``paragraph_index`` (body paragraphs) or ``location`` (table
        cells), plus ``position`` and ``context``. ``position`` is a word
        index when ``whole_word`` is true and a character offset otherwise.
        On failure, a dictionary with a single ``error`` key; no exception is
        raised for a missing file, an empty query or a read error.
    """
    import os

    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    if not text_to_find:
        return {"error": "Search text cannot be empty"}

    try:
        doc = Document(doc_path)
        results = {
            "query": text_to_find,
            "match_case": match_case,
            "whole_word": whole_word,
            "occurrences": [],
            "total_count": 0
        }

        # Normalise the query once instead of once per paragraph.
        query = text_to_find if match_case else text_to_find.lower()

        # Search in paragraphs
        for para_idx, para in enumerate(doc.paragraphs):
            _record_matches(results, para.text, query, match_case, whole_word,
                            {"paragraph_index": para_idx})

        # Search in tables
        for table_idx, table in enumerate(doc.tables):
            for row_idx, row in enumerate(table.rows):
                for col_idx, cell in enumerate(row.cells):
                    where = {"location": f"Table {table_idx}, Row {row_idx}, Column {col_idx}"}
                    for para in cell.paragraphs:
                        _record_matches(results, para.text, query, match_case,
                                        whole_word, where)

        return results
    except Exception as e:
        # Any failure while opening or walking the document is reported to
        # the caller as an error dictionary.
        return {"error": f"Failed to search for text: {str(e)}"}