上传文件至 /
This commit is contained in:
parent
57bdaca9b8
commit
75baeea511
Binary file not shown.
|
@ -0,0 +1,167 @@
|
|||
"""
|
||||
Document utility functions for Word Document Server.
|
||||
"""
|
||||
import json
|
||||
from typing import Dict, List, Any
|
||||
from docx import Document
|
||||
|
||||
|
||||
def get_document_properties(doc_path: str) -> Dict[str, Any]:
    """Return core metadata and simple size statistics for a Word document.

    Args:
        doc_path: Filesystem path of the document to inspect.

    Returns:
        A dict with the keys ``title``, ``author``, ``subject``, ``keywords``,
        ``created``, ``modified``, ``last_modified_by``, ``revision``,
        ``page_count``, ``word_count``, ``paragraph_count`` and
        ``table_count``. When the path does not exist, or anything raises
        while the document is being read, a dict holding a single ``error``
        key is returned instead; no exception propagates to the caller.
    """
    import os

    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    try:
        document = Document(doc_path)
        props = document.core_properties

        info: Dict[str, Any] = {}

        # Plain text fields: a missing value is reported as "".
        for field in ("title", "author", "subject", "keywords"):
            info[field] = getattr(props, field) or ""

        # Timestamps are stringified; an unset timestamp is reported as "".
        info["created"] = str(props.created) if props.created else ""
        info["modified"] = str(props.modified) if props.modified else ""

        info["last_modified_by"] = props.last_modified_by or ""
        info["revision"] = props.revision or 0

        # NOTE: this is the number of sections in the document, not a
        # rendered page count; the key name is kept for existing callers.
        info["page_count"] = len(document.sections)

        # Only top-level body paragraphs are counted; words are
        # whitespace-separated tokens.
        body_paragraphs = document.paragraphs
        info["word_count"] = sum(len(p.text.split()) for p in body_paragraphs)
        info["paragraph_count"] = len(body_paragraphs)
        info["table_count"] = len(document.tables)

        return info
    except Exception as exc:
        return {"error": f"Failed to get document properties: {exc}"}
|
||||
|
||||
|
||||
def extract_document_text(doc_path: str) -> str:
    """Extract the text of a Word document as one newline-joined string.

    Body paragraphs come first, in document order, followed by the
    paragraphs of every top-level table cell (table by table, row by row).
    Tables nested inside cells are not visited.

    Args:
        doc_path: Filesystem path of the document to read.

    Returns:
        The extracted text. On failure no exception is raised: a
        human-readable message is returned instead, either
        ``"Document <path> does not exist"`` or
        ``"Failed to extract text: <reason>"``.
    """
    import os

    if not os.path.exists(doc_path):
        return f"Document {doc_path} does not exist"

    try:
        doc = Document(doc_path)
        lines = [paragraph.text for paragraph in doc.paragraphs]

        for table in doc.tables:
            # python-docx reports a merged cell once for every grid position
            # it spans, so the same content would otherwise be emitted
            # several times. Cells are keyed on their underlying element;
            # storing the element keeps it alive so its id() stays unique.
            visited = {}
            for row in table.rows:
                for cell in row.cells:
                    element = getattr(cell, "_tc", cell)
                    if id(element) in visited:
                        continue
                    visited[id(element)] = element
                    lines.extend(
                        paragraph.text for paragraph in cell.paragraphs
                    )

        return "\n".join(lines)
    except Exception as e:
        return f"Failed to extract text: {e}"
|
||||
|
||||
|
||||
def get_document_structure(doc_path: str) -> Dict[str, Any]:
    """Summarise the paragraphs and tables of a Word document.

    Args:
        doc_path: Filesystem path of the document to inspect.

    Returns:
        A dict with two lists:

        * ``paragraphs`` - one entry per body paragraph with its ``index``,
          its ``text`` cut to 100 characters (``...`` appended when cut) and
          its ``style`` name (``"Normal"`` when the paragraph has no style).
        * ``tables`` - one entry per table with its ``index``, ``rows`` and
          ``columns`` counts, and a ``preview`` of at most 3 x 3 cells, each
          cut to 20 characters; a cell that cannot be addressed is shown as
          ``"N/A"``.

        When the path does not exist or reading fails, a dict holding a
        single ``error`` key is returned instead.
    """
    import os

    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    def _clip(value, limit):
        # Cut value to limit characters, flagging the cut with an ellipsis.
        return value[:limit] + ("..." if len(value) > limit else "")

    try:
        document = Document(doc_path)

        paragraph_entries = [
            {
                "index": position,
                "text": _clip(paragraph.text, 100),
                "style": paragraph.style.name if paragraph.style else "Normal",
            }
            for position, paragraph in enumerate(document.paragraphs)
        ]

        table_entries = []
        for position, table in enumerate(document.tables):
            entry = {
                "index": position,
                "rows": len(table.rows),
                "columns": len(table.columns),
                "preview": [],
            }

            # Sample only the top-left corner of the table.
            for row_idx in range(min(3, len(table.rows))):
                preview_row = []
                for col_idx in range(min(3, len(table.columns))):
                    try:
                        preview_row.append(
                            _clip(table.cell(row_idx, col_idx).text, 20)
                        )
                    except IndexError:
                        preview_row.append("N/A")
                entry["preview"].append(preview_row)

            table_entries.append(entry)

        return {"paragraphs": paragraph_entries, "tables": table_entries}
    except Exception as exc:
        return {"error": f"Failed to get document structure: {exc}"}
|
||||
|
||||
|
||||
def find_paragraph_by_text(doc, text, partial_match=False):
    """
    Locate the body paragraphs whose text matches a search string.

    Args:
        doc: Document object (anything exposing a ``paragraphs`` sequence
            whose items have a ``text`` attribute)
        text: Text to look for
        partial_match: When true, a paragraph matches if it contains
            ``text``; otherwise its whole text must equal ``text``

    Returns:
        List of the matching paragraphs' indices, in document order
    """
    if partial_match:
        return [
            position
            for position, paragraph in enumerate(doc.paragraphs)
            if text in paragraph.text
        ]

    return [
        position
        for position, paragraph in enumerate(doc.paragraphs)
        if paragraph.text == text
    ]
|
||||
|
||||
|
||||
def find_and_replace_text(doc, old_text, new_text):
    """
    Find and replace text throughout the document.

    Body paragraphs and the paragraphs of every top-level table cell are
    searched. Replacement happens run by run, so an occurrence of
    ``old_text`` that is split across two runs is not replaced. Tables
    nested inside cells are not visited.

    Args:
        doc: Document object
        old_text: Text to find; an empty string matches nothing
        new_text: Text to replace with

    Returns:
        Number of replacements made (every replaced occurrence counts, also
        when several occur in the same run)
    """
    # An empty needle is "found" between every pair of characters, so
    # str.replace would splice new_text throughout the whole document.
    if old_text == "":
        return 0

    def _replace_in_paragraphs(paragraphs):
        # Replace old_text in each run and return the occurrences replaced.
        replaced = 0
        for para in paragraphs:
            if old_text not in para.text:
                continue
            for run in para.runs:
                hits = run.text.count(old_text)
                if hits:
                    run.text = run.text.replace(old_text, new_text)
                    replaced += hits
        return replaced

    # Search in paragraphs
    count = _replace_in_paragraphs(doc.paragraphs)

    # Search in tables
    for table in doc.tables:
        # python-docx reports a merged cell once per grid position it spans.
        # Handling it again would re-replace text whenever new_text contains
        # old_text, so each underlying cell element is processed only once.
        # Storing the element keeps it alive so its id() stays unique.
        visited = {}
        for row in table.rows:
            for cell in row.cells:
                element = getattr(cell, "_tc", cell)
                if id(element) in visited:
                    continue
                visited[id(element)] = element
                count += _replace_in_paragraphs(cell.paragraphs)

    return count
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue