Report_Generate_Server/utils/extended_document_utils.py

"""
Extended document utilities for Word Document Server.
"""
from typing import Dict, List, Any, Tuple
from docx import Document


def get_paragraph_text(doc_path: str, paragraph_index: int) -> Dict[str, Any]:
    """
    Read one paragraph from a Word document and describe it.

    Args:
        doc_path: Path to the Word document
        paragraph_index: Zero-based position of the wanted paragraph in
            the document's paragraph list

    Returns:
        On success, a dictionary with the keys "index", "text", "style"
        and "is_heading". On failure (missing file, index out of range,
        or an exception raised while reading), a dictionary holding a
        single "error" message.
    """
    import os

    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    try:
        paragraphs = Document(doc_path).paragraphs

        # Negative indices are rejected instead of counting from the end.
        if paragraph_index < 0 or paragraph_index >= len(paragraphs):
            total = len(paragraphs)
            return {"error": f"Invalid paragraph index: {paragraph_index}. Document has {total} paragraphs."}

        target = paragraphs[paragraph_index]
        style = target.style

        if style:
            style_name = style.name
            heading = style_name.startswith("Heading")
        else:
            # No style attached: report the fallback name, never a heading.
            style_name = "Normal"
            heading = False

        info = {"index": paragraph_index, "text": target.text}
        info["style"] = style_name
        info["is_heading"] = heading
        return info
    except Exception as exc:
        return {"error": f"Failed to get paragraph text: {exc!s}"}


def _find_positions(text: str, needle: str, whole_word: bool) -> List[int]:
    """
    Locate every match of ``needle`` in ``text``.

    Both strings must already be case-normalised by the caller.

    Args:
        text: Paragraph text to search in
        needle: Non-empty text to look for
        whole_word: If True, match whitespace-separated words and return
            their word indices; otherwise return character offsets of
            non-overlapping substring matches

    Returns:
        List of positions in ascending order (empty when nothing matches)
    """
    if whole_word:
        import re

        # Punctuation glued to a word ("report." or "(report)") must not
        # hide it, so a token also matches once its leading and trailing
        # non-word characters are removed. The exact comparison is kept
        # so that queries which themselves contain punctuation still hit.
        edge_punctuation = re.compile(r"^\W+|\W+$")
        return [
            word_idx
            for word_idx, word in enumerate(text.split())
            if word == needle or edge_punctuation.sub("", word) == needle
        ]

    positions = []
    pos = text.find(needle)
    while pos != -1:
        positions.append(pos)
        # Resume after the match so that overlapping hits are not counted.
        pos = text.find(needle, pos + len(needle))
    return positions


def find_text(doc_path: str, text_to_find: str, match_case: bool = True, whole_word: bool = False) -> Dict[str, Any]:
    """
    Find all occurrences of specific text in a Word document.

    The paragraphs in ``doc.paragraphs`` are searched first, followed by
    the paragraphs of every cell of every table in ``doc.tables``.

    Args:
        doc_path: Path to the Word document
        text_to_find: Text to search for (must not be empty)
        match_case: Whether to perform case-sensitive search
        whole_word: Whether to match whole words only. Words are the
            whitespace-separated tokens of a paragraph; punctuation
            attached to either end of a token is ignored.

    Returns:
        Dictionary with the keys "query", "match_case", "whole_word",
        "occurrences" and "total_count". Each occurrence carries
        "position" and "context" (the first 100 characters of the
        paragraph, followed by "..." when truncated) plus either
        "paragraph_index" for body paragraphs or "location" for table
        cells. "position" is a word index when whole_word is True and a
        character offset otherwise. When case is ignored the offset
        refers to the lower-cased paragraph text.
        If the file is missing, the search text is empty, or reading the
        document fails, a dictionary with a single "error" key is
        returned instead.
    """
    import os
    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    if not text_to_find:
        return {"error": "Search text cannot be empty"}

    try:
        doc = Document(doc_path)
        needle = text_to_find if match_case else text_to_find.lower()
        occurrences: List[Dict[str, Any]] = []

        def collect(para, where: Dict[str, Any]) -> None:
            """Append one occurrence entry per match found in ``para``."""
            raw_text = para.text
            haystack = raw_text if match_case else raw_text.lower()
            context = raw_text[:100] + ("..." if len(raw_text) > 100 else "")
            for position in _find_positions(haystack, needle, whole_word):
                occurrences.append({**where, "position": position, "context": context})

        # Search in paragraphs
        for i, para in enumerate(doc.paragraphs):
            collect(para, {"paragraph_index": i})

        # Search in tables
        # NOTE(review): python-docx appears to yield the same cell once per
        # spanned column for merged cells, which would report its matches
        # more than once - confirm against the library before relying on
        # total_count for documents with merged cells.
        for table_idx, table in enumerate(doc.tables):
            for row_idx, row in enumerate(table.rows):
                for col_idx, cell in enumerate(row.cells):
                    location = f"Table {table_idx}, Row {row_idx}, Column {col_idx}"
                    for para in cell.paragraphs:
                        collect(para, {"location": location})

        return {
            "query": text_to_find,
            "match_case": match_case,
            "whole_word": whole_word,
            "occurrences": occurrences,
            "total_count": len(occurrences),
        }
    except Exception as e:
        return {"error": f"Failed to search for text: {str(e)}"}
完成部分数据库的数据拉取代码 2025-07-03 17:50:42 +08:00			`"""`
			`Extended document utilities for Word Document Server.`
			`"""`
			`from typing import Dict, List, Any, Tuple`
			`from docx import Document`


			`def get_paragraph_text(doc_path: str, paragraph_index: int) -> Dict[str, Any]:`
			`"""`
			`Get text from a specific paragraph in a Word document.`

			`Args:`
			`doc_path: Path to the Word document`
			`paragraph_index: Index of the paragraph to extract (0-based)`

			`Returns:`
			`Dictionary with paragraph text and metadata`
			`"""`
			`import os`
			`if not os.path.exists(doc_path):`
			`return {"error": f"Document {doc_path} does not exist"}`

			`try:`
			`doc = Document(doc_path)`

			`# Check if paragraph index is valid`
			`if paragraph_index < 0 or paragraph_index >= len(doc.paragraphs):`
			`return {"error": f"Invalid paragraph index: {paragraph_index}. Document has {len(doc.paragraphs)} paragraphs."}`

			`paragraph = doc.paragraphs[paragraph_index]`

			`return {`
			`"index": paragraph_index,`
			`"text": paragraph.text,`
			`"style": paragraph.style.name if paragraph.style else "Normal",`
			`"is_heading": paragraph.style.name.startswith("Heading") if paragraph.style else False`
			`}`
			`except Exception as e:`
			`return {"error": f"Failed to get paragraph text: {str(e)}"}`


			`def find_text(doc_path: str, text_to_find: str, match_case: bool = True, whole_word: bool = False) -> Dict[str, Any]:`
			`"""`
			`Find all occurrences of specific text in a Word document.`

			`Args:`
			`doc_path: Path to the Word document`
			`text_to_find: Text to search for`
			`match_case: Whether to perform case-sensitive search`
			`whole_word: Whether to match whole words only`

			`Returns:`
			`Dictionary with search results`
			`"""`
			`import os`
			`if not os.path.exists(doc_path):`
			`return {"error": f"Document {doc_path} does not exist"}`

			`if not text_to_find:`
			`return {"error": "Search text cannot be empty"}`

			`try:`
			`doc = Document(doc_path)`
			`results = {`
			`"query": text_to_find,`
			`"match_case": match_case,`
			`"whole_word": whole_word,`
			`"occurrences": [],`
			`"total_count": 0`
			`}`

			`# Search in paragraphs`
			`for i, para in enumerate(doc.paragraphs):`
			`# Prepare text for comparison`
			`para_text = para.text`
			`search_text = text_to_find`

			`if not match_case:`
			`para_text = para_text.lower()`
			`search_text = search_text.lower()`

			`# Find all occurrences (simple implementation)`
			`start_pos = 0`
			`while True:`
			`if whole_word:`
			`# For whole word search, we need to check word boundaries`
			`words = para_text.split()`
			`found = False`
			`for word_idx, word in enumerate(words):`
			`if (word == search_text or`
			`(not match_case and word.lower() == search_text.lower())):`
			`results["occurrences"].append({`
			`"paragraph_index": i,`
			`"position": word_idx,`
			`"context": para.text[:100] + ("..." if len(para.text) > 100 else "")`
			`})`
			`results["total_count"] += 1`
			`found = True`

			`# Break after checking all words`
			`break`
			`else:`
			`# For substring search`
			`pos = para_text.find(search_text, start_pos)`
			`if pos == -1:`
			`break`

			`results["occurrences"].append({`
			`"paragraph_index": i,`
			`"position": pos,`
			`"context": para.text[:100] + ("..." if len(para.text) > 100 else "")`
			`})`
			`results["total_count"] += 1`
			`start_pos = pos + len(search_text)`

			`# Search in tables`
			`for table_idx, table in enumerate(doc.tables):`
			`for row_idx, row in enumerate(table.rows):`
			`for col_idx, cell in enumerate(row.cells):`
			`for para_idx, para in enumerate(cell.paragraphs):`
			`# Prepare text for comparison`
			`para_text = para.text`
			`search_text = text_to_find`

			`if not match_case:`
			`para_text = para_text.lower()`
			`search_text = search_text.lower()`

			`# Find all occurrences (simple implementation)`
			`start_pos = 0`
			`while True:`
			`if whole_word:`
			`# For whole word search, check word boundaries`
			`words = para_text.split()`
			`found = False`
			`for word_idx, word in enumerate(words):`
			`if (word == search_text or`
			`(not match_case and word.lower() == search_text.lower())):`
			`results["occurrences"].append({`
			`"location": f"Table {table_idx}, Row {row_idx}, Column {col_idx}",`
			`"position": word_idx,`
			`"context": para.text[:100] + ("..." if len(para.text) > 100 else "")`
			`})`
			`results["total_count"] += 1`
			`found = True`

			`# Break after checking all words`
			`break`
			`else:`
			`# For substring search`
			`pos = para_text.find(search_text, start_pos)`
			`if pos == -1:`
			`break`

			`results["occurrences"].append({`
			`"location": f"Table {table_idx}, Row {row_idx}, Column {col_idx}",`
			`"position": pos,`
			`"context": para.text[:100] + ("..." if len(para.text) > 100 else "")`
			`})`
			`results["total_count"] += 1`
			`start_pos = pos + len(search_text)`

			`return results`
			`except Exception as e:`
			`return {"error": f"Failed to search for text: {str(e)}"}`