# Report_Generate_Server/utils/document_utils.py

"""
Document utility functions for Word Document Server.
"""
import json
from typing import Dict, List, Any
from docx import Document


def get_document_properties(doc_path: str) -> Dict[str, Any]:
    """Collect core metadata and simple size statistics for a Word document.

    Args:
        doc_path: Filesystem path of the document to inspect.

    Returns:
        On success, a dict with the keys ``title``, ``author``, ``subject``,
        ``keywords``, ``created``, ``modified``, ``last_modified_by``,
        ``revision``, ``page_count``, ``word_count``, ``paragraph_count`` and
        ``table_count``. Missing text properties become ``""`` and a missing
        revision becomes ``0``.
        If the path does not exist, or anything raises while reading the
        document, a dict with a single ``error`` key describing the problem.
    """
    import os

    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    try:
        document = Document(doc_path)
        props = document.core_properties
        body_paragraphs = document.paragraphs

        # Plain text properties share the same "blank when unset" rule.
        info: Dict[str, Any] = {
            field: getattr(props, field) or ""
            for field in ("title", "author", "subject", "keywords")
        }

        created_at = props.created
        modified_at = props.modified
        info["created"] = str(created_at) if created_at else ""
        info["modified"] = str(modified_at) if modified_at else ""
        info["last_modified_by"] = props.last_modified_by or ""
        info["revision"] = props.revision or 0

        # NOTE(review): the value reported as "page_count" is the number of
        # sections in the document, not a rendered page count.
        info["page_count"] = len(document.sections)

        # Words are whitespace-separated tokens of the body paragraphs only.
        total_words = 0
        for para in body_paragraphs:
            total_words += len(para.text.split())
        info["word_count"] = total_words

        info["paragraph_count"] = len(body_paragraphs)
        info["table_count"] = len(document.tables)
        return info
    except Exception as exc:
        return {"error": f"Failed to get document properties: {exc!s}"}


def extract_document_text(doc_path: str) -> str:
    """Return the text of a Word document as one newline-joined string.

    Body paragraphs come first, in document order, followed by the paragraphs
    of every cell of every table (table by table, row by row, cell by cell).

    Args:
        doc_path: Filesystem path of the document to read.

    Returns:
        The collected text joined with ``"\\n"``. When the path does not exist
        or reading fails, a human-readable message is returned instead; it is
        a plain string as well, so callers cannot tell it apart by type.
    """
    import os

    if not os.path.exists(doc_path):
        return f"Document {doc_path} does not exist"

    try:
        document = Document(doc_path)

        lines = [para.text for para in document.paragraphs]
        # Table content is appended after all body paragraphs.
        lines.extend(
            cell_para.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
            for cell_para in cell.paragraphs
        )

        return "\n".join(lines)
    except Exception as exc:
        return f"Failed to extract text: {exc!s}"


def get_document_structure(doc_path: str) -> Dict[str, Any]:
    """Summarise the paragraphs and tables of a Word document.

    Args:
        doc_path: Filesystem path of the document to inspect.

    Returns:
        On success, a dict with two lists:

        * ``paragraphs`` - one entry per body paragraph with its ``index``,
          its ``text`` cut to 100 characters (``"..."`` appended when cut) and
          its ``style`` name (``"Normal"`` when the paragraph has no style).
        * ``tables`` - one entry per table with its ``index``, ``rows`` and
          ``columns`` counts, and a ``preview`` of at most 3x3 cells, each
          cut to 20 characters; a cell that raises ``IndexError`` is shown
          as ``"N/A"``.

        If the path does not exist, or anything else raises, a dict with a
        single ``error`` key.
    """
    import os

    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    def clip(value, limit):
        # Keep the first `limit` characters and mark truncation with "...".
        return value[:limit] + ("..." if len(value) > limit else "")

    try:
        document = Document(doc_path)

        paragraph_entries = [
            {
                "index": position,
                "text": clip(para.text, 100),
                "style": para.style.name if para.style else "Normal",
            }
            for position, para in enumerate(document.paragraphs)
        ]

        table_entries = []
        for position, table in enumerate(document.tables):
            row_total = len(table.rows)
            col_total = len(table.columns)

            # Preview is limited to the top-left 3x3 corner of the table.
            preview = []
            for r in range(min(3, row_total)):
                sample = []
                for c in range(min(3, col_total)):
                    try:
                        sample.append(clip(table.cell(r, c).text, 20))
                    except IndexError:
                        sample.append("N/A")
                preview.append(sample)

            table_entries.append({
                "index": position,
                "rows": row_total,
                "columns": col_total,
                "preview": preview,
            })

        return {"paragraphs": paragraph_entries, "tables": table_entries}
    except Exception as exc:
        return {"error": f"Failed to get document structure: {exc!s}"}


def find_paragraph_by_text(doc, text, partial_match=False):
    """
    Locate body paragraphs whose text matches a search string.

    Args:
        doc: Document object; only its ``paragraphs`` are examined
        text: Text to search for
        partial_match: If True, a paragraph matches when it contains the text;
            if False, the paragraph text must equal the text exactly

    Returns:
        List of indices (positions within ``doc.paragraphs``) of the matching
        paragraphs, in ascending order; empty when nothing matches
    """
    if partial_match:
        def is_hit(candidate):
            return text in candidate
    else:
        def is_hit(candidate):
            return candidate == text

    return [
        position
        for position, para in enumerate(doc.paragraphs)
        if is_hit(para.text)
    ]


def find_and_replace_text(doc, old_text, new_text):
    """
    Find and replace text throughout the document.

    The search covers ``doc.paragraphs`` and the paragraphs of every cell of
    every table in ``doc.tables``. Replacement is done run by run, so the
    formatting of each run is kept; as a consequence, an occurrence of
    ``old_text`` that is split across two or more runs is NOT replaced.

    Args:
        doc: Document object (modified in place)
        old_text: Text to find; an empty string matches nothing
        new_text: Text to replace with

    Returns:
        Number of replacements made, i.e. the number of occurrences of
        ``old_text`` that were substituted
    """
    # str.replace("", x) would insert x between every character of every run;
    # an empty search string is treated as "nothing to replace" instead.
    if not old_text:
        return 0

    # Search in body paragraphs
    count = _replace_in_paragraphs(doc.paragraphs, old_text, new_text)

    # Search in tables. row.cells can yield the same underlying cell more than
    # once (python-docx repeats a merged cell for every grid position it
    # covers), so each cell element is processed only once. Otherwise a
    # replacement whose new_text contains old_text would be applied again.
    # The dict keeps a reference to every element so ids cannot be reused.
    processed = {}
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                element = getattr(cell, "_tc", cell)
                if id(element) in processed:
                    continue
                processed[id(element)] = element
                count += _replace_in_paragraphs(
                    cell.paragraphs, old_text, new_text
                )

    return count


def _replace_in_paragraphs(paragraphs, old_text, new_text):
    """Replace old_text in every run of the given paragraphs; return the hit count."""
    replaced = 0
    for para in paragraphs:
        if old_text not in para.text:
            continue
        for run in para.runs:
            hits = run.text.count(old_text)
            if hits:
                run.text = run.text.replace(old_text, new_text)
                replaced += hits
    return replaced
完成部分数据库的数据拉取代码 2025-07-03 17:50:42 +08:00			`"""`
			`Document utility functions for Word Document Server.`
			`"""`
			`import json`
			`from typing import Dict, List, Any`
			`from docx import Document`


			`def get_document_properties(doc_path: str) -> Dict[str, Any]:`
			`"""Get properties of a Word document."""`
			`import os`
			`if not os.path.exists(doc_path):`
			`return {"error": f"Document {doc_path} does not exist"}`

			`try:`
			`doc = Document(doc_path)`
			`core_props = doc.core_properties`

			`return {`
			`"title": core_props.title or "",`
			`"author": core_props.author or "",`
			`"subject": core_props.subject or "",`
			`"keywords": core_props.keywords or "",`
			`"created": str(core_props.created) if core_props.created else "",`
			`"modified": str(core_props.modified) if core_props.modified else "",`
			`"last_modified_by": core_props.last_modified_by or "",`
			`"revision": core_props.revision or 0,`
			`"page_count": len(doc.sections),`
			`"word_count": sum(len(paragraph.text.split()) for paragraph in doc.paragraphs),`
			`"paragraph_count": len(doc.paragraphs),`
			`"table_count": len(doc.tables)`
			`}`
			`except Exception as e:`
			`return {"error": f"Failed to get document properties: {str(e)}"}`


			`def extract_document_text(doc_path: str) -> str:`
			`"""Extract all text from a Word document."""`
			`import os`
			`if not os.path.exists(doc_path):`
			`return f"Document {doc_path} does not exist"`

			`try:`
			`doc = Document(doc_path)`
			`text = []`

			`for paragraph in doc.paragraphs:`
			`text.append(paragraph.text)`

			`for table in doc.tables:`
			`for row in table.rows:`
			`for cell in row.cells:`
			`for paragraph in cell.paragraphs:`
			`text.append(paragraph.text)`

			`return "\n".join(text)`
			`except Exception as e:`
			`return f"Failed to extract text: {str(e)}"`


			`def get_document_structure(doc_path: str) -> Dict[str, Any]:`
			`"""Get the structure of a Word document."""`
			`import os`
			`if not os.path.exists(doc_path):`
			`return {"error": f"Document {doc_path} does not exist"}`

			`try:`
			`doc = Document(doc_path)`
			`structure = {`
			`"paragraphs": [],`
			`"tables": []`
			`}`

			`# Get paragraphs`
			`for i, para in enumerate(doc.paragraphs):`
			`structure["paragraphs"].append({`
			`"index": i,`
			`"text": para.text[:100] + ("..." if len(para.text) > 100 else ""),`
			`"style": para.style.name if para.style else "Normal"`
			`})`

			`# Get tables`
			`for i, table in enumerate(doc.tables):`
			`table_data = {`
			`"index": i,`
			`"rows": len(table.rows),`
			`"columns": len(table.columns),`
			`"preview": []`
			`}`

			`# Get sample of table data`
			`max_rows = min(3, len(table.rows))`
			`for row_idx in range(max_rows):`
			`row_data = []`
			`max_cols = min(3, len(table.columns))`
			`for col_idx in range(max_cols):`
			`try:`
			`cell_text = table.cell(row_idx, col_idx).text`
			`row_data.append(cell_text[:20] + ("..." if len(cell_text) > 20 else ""))`
			`except IndexError:`
			`row_data.append("N/A")`
			`table_data["preview"].append(row_data)`

			`structure["tables"].append(table_data)`

			`return structure`
			`except Exception as e:`
			`return {"error": f"Failed to get document structure: {str(e)}"}`


			`def find_paragraph_by_text(doc, text, partial_match=False):`
			`"""`
			`Find paragraphs containing specific text.`

			`Args:`
			`doc: Document object`
			`text: Text to search for`
			`partial_match: If True, matches paragraphs containing the text; if False, matches exact text`

			`Returns:`
			`List of paragraph indices that match the criteria`
			`"""`
			`matching_paragraphs = []`

			`for i, para in enumerate(doc.paragraphs):`
			`if partial_match and text in para.text:`
			`matching_paragraphs.append(i)`
			`elif not partial_match and para.text == text:`
			`matching_paragraphs.append(i)`

			`return matching_paragraphs`


			`def find_and_replace_text(doc, old_text, new_text):`
			`"""`
			`Find and replace text throughout the document.`

			`Args:`
			`doc: Document object`
			`old_text: Text to find`
			`new_text: Text to replace with`

			`Returns:`
			`Number of replacements made`
			`"""`
			`count = 0`

			`# Search in paragraphs`
			`for para in doc.paragraphs:`
			`if old_text in para.text:`
			`for run in para.runs:`
			`if old_text in run.text:`
			`run.text = run.text.replace(old_text, new_text)`
			`count += 1`

			`# Search in tables`
			`for table in doc.tables:`
			`for row in table.rows:`
			`for cell in row.cells:`
			`for para in cell.paragraphs:`
			`if old_text in para.text:`
			`for run in para.runs:`
			`if old_text in run.text:`
			`run.text = run.text.replace(old_text, new_text)`
			`count += 1`

			`return count`