From 75baeea51123c9e0c2ca75e4b2a4cd93ca16ad14 Mon Sep 17 00:00:00 2001 From: XuYuqi Date: Fri, 25 Jul 2025 14:32:40 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?= =?UTF-8?q?=20/?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- __init__.pyc | Bin 0 -> 703 bytes document_utils.py | 167 ++++++++++++++++++++++++++++++++++++ document_utils.pyc | Bin 0 -> 7870 bytes extended_document_utils.pyc | Bin 0 -> 6662 bytes 4 files changed, 167 insertions(+) create mode 100644 __init__.pyc create mode 100644 document_utils.py create mode 100644 document_utils.pyc create mode 100644 extended_document_utils.pyc diff --git a/__init__.pyc b/__init__.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be2a138f9db6b9a480f764f9a1af029f8b866a1b GIT binary patch literal 703 zcmcIhJ&)8d5Ve!+SB~R^#2u~MZT1HsL`4e)cL?Fe@;S2`>tZLGINUB35`Tb>pMdx` z)M=@%i*(&pjI+0V6qH!eljpse=b6dZ3w< zNWCv7{gVxZ-? Dict[str, Any]: + """Get properties of a Word document.""" + import os + if not os.path.exists(doc_path): + return {"error": f"Document {doc_path} does not exist"} + + try: + doc = Document(doc_path) + core_props = doc.core_properties + + return { + "title": core_props.title or "", + "author": core_props.author or "", + "subject": core_props.subject or "", + "keywords": core_props.keywords or "", + "created": str(core_props.created) if core_props.created else "", + "modified": str(core_props.modified) if core_props.modified else "", + "last_modified_by": core_props.last_modified_by or "", + "revision": core_props.revision or 0, + "page_count": len(doc.sections), + "word_count": sum(len(paragraph.text.split()) for paragraph in doc.paragraphs), + "paragraph_count": len(doc.paragraphs), + "table_count": len(doc.tables) + } + except Exception as e: + return {"error": f"Failed to get document properties: {str(e)}"} + + +def extract_document_text(doc_path: str) -> str: + """Extract all text from a Word document.""" + import os + if not os.path.exists(doc_path): + return f"Document {doc_path} does not exist" + + try: + doc = Document(doc_path) + text = [] + + for paragraph in doc.paragraphs: + text.append(paragraph.text) + + for table in doc.tables: + for row in table.rows: + for cell in row.cells: + for paragraph in cell.paragraphs: + text.append(paragraph.text) + + return "\n".join(text) + except Exception as e: + return f"Failed to extract text: {str(e)}" + + +def get_document_structure(doc_path: str) -> Dict[str, Any]: + """Get the structure of a Word document.""" + import os + if not os.path.exists(doc_path): + return {"error": f"Document {doc_path} does not exist"} + + try: + doc = Document(doc_path) + structure = { + "paragraphs": [], + "tables": [] + } + + # Get paragraphs + for i, para in enumerate(doc.paragraphs): + structure["paragraphs"].append({ + "index": i, + "text": para.text[:100] + ("..." if len(para.text) > 100 else ""), + "style": para.style.name if para.style else "Normal" + }) + + # Get tables + for i, table in enumerate(doc.tables): + table_data = { + "index": i, + "rows": len(table.rows), + "columns": len(table.columns), + "preview": [] + } + + # Get sample of table data + max_rows = min(3, len(table.rows)) + for row_idx in range(max_rows): + row_data = [] + max_cols = min(3, len(table.columns)) + for col_idx in range(max_cols): + try: + cell_text = table.cell(row_idx, col_idx).text + row_data.append(cell_text[:20] + ("..." if len(cell_text) > 20 else "")) + except IndexError: + row_data.append("N/A") + table_data["preview"].append(row_data) + + structure["tables"].append(table_data) + + return structure + except Exception as e: + return {"error": f"Failed to get document structure: {str(e)}"} + + +def find_paragraph_by_text(doc, text, partial_match=False): + """ + Find paragraphs containing specific text. + + Args: + doc: Document object + text: Text to search for + partial_match: If True, matches paragraphs containing the text; if False, matches exact text + + Returns: + List of paragraph indices that match the criteria + """ + matching_paragraphs = [] + + for i, para in enumerate(doc.paragraphs): + if partial_match and text in para.text: + matching_paragraphs.append(i) + elif not partial_match and para.text == text: + matching_paragraphs.append(i) + + return matching_paragraphs + + +def find_and_replace_text(doc, old_text, new_text): + """ + Find and replace text throughout the document. + + Args: + doc: Document object + old_text: Text to find + new_text: Text to replace with + + Returns: + Number of replacements made + """ + count = 0 + + # Search in paragraphs + for para in doc.paragraphs: + if old_text in para.text: + for run in para.runs: + if old_text in run.text: + run.text = run.text.replace(old_text, new_text) + count += 1 + + # Search in tables + for table in doc.tables: + for row in table.rows: + for cell in row.cells: + for para in cell.paragraphs: + if old_text in para.text: + for run in para.runs: + if old_text in run.text: + run.text = run.text.replace(old_text, new_text) + count += 1 + + return count diff --git a/document_utils.pyc b/document_utils.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a92428d5414d808353adbb406e3eefce5e0ce4a GIT binary patch literal 7870 zcmd5hYit|GnX}yGn-nGMMLjI8CCfG?nUW>h@vCxN$(H1#4jLJG=$a@rcO}uGNM&|u zO9GkrfZRa|1Vjh;5T&J2-a(hDh!4;KcLxEWtmqJ}vz8WvBAVp{`}^Fq zvpDX6wgcL(Z@_oLxLfcF?)RB#kKlP9O?w5)6bja?oN$dLBFP0YF7rt_8jH%y{A@BF zk)w&Y#Lp%$|7rpYyooa<;_D&~xgHU2E(cki8;eF{oqI7V$vS&Jz8qw9hk>m-ghXU! zF)YvPHZ00X9FH*OS0GdabrybKU9r%uA=`q|Op3W?Vr60E=_iFg1&q&3EXg6T0+&KR zfi1huMwuz(M6$g!uNpsXNm;IO#(0`dvDd6dYmpIb&^C?@7277YV%t%L?=;%DL53?} zJV41DspL6cSjTN(J~huAdFDKiF|Wvvx2(;``;8Vt@TESZRY^%cVDuvd7m8rk89ig@ zQ>cDsG^_TzioQ9kihd;*`wU{0Sb`(tnL=GiK4gp|bP(Z=GzVwutk_Gga^_FaZA+Ai zSYAhhCC$CdO%>lam>Xz`eI4Ckg07X$aZ%)he&RekC&tC4MLe?7GbhS3CPrz4LT6)%NH`{qgiKs=jM7N}{|$dPk+Odp z%1-olBpcie-S0fH+j%0_c~b2>nd>;EcAP4Vm&31ME(&^ds~nYMqHYT(<@p5G?NajU zH4&7D<85*I251>cw@0uTmPJ8#EF^^JY!qs>v9Ki17|ofh%en)L*P{~XC*8Feo)c#x ziDX>XU4%lh?lU)|*x{DLS7SxYm9C57XiOA%Il;pwz!?|SUuhFg^T7t)l8|(+pu6bl zOPJ^c-4{tKxZWx<95;{JBT|7Afl){B=~BgUjZe#K)%8_ zI^9MYm2~HYrHHslkb+iBbPw(#Gg)9(cbObWIwMNt8uH}94FsimkAuYnFiLch6ocz| z6Fua-Hz&!T=Ju{V_tD6QBO6C@?k?5cl}C*Ac}A;kR$4}~uK_3)-#Nu`F3;6lCz%H| zP3ud!8eXm8S6QvmuQnc8b>(fSiB}uDmB!w!eHv7o4l7MZw+6N518Q?nY3|>8S@UA+7hWYQrPy@VMfC={CD@Np0`X`3F@0 zfZ`vx_p-+KsQej)Kfi+wS_50}JZRm&5y`}It-We%FEk35@7~ZKN1&G5Kdl06omN|? z9|StHLz^SHz@Qo!e9+dhA!SxRIhQ$?Z}HZ;@~8luuE$@uBTpmQhe@U!(mfgfrF8e$ zFRp)nU1=F7#W$fis>@?|@+_GS6^Weu)@ak{IrPOj*Vtk9OK1I9C;MePL+YJYfE9X$ z95*$K{|*NQ9>ZU57HtAcjUsoO{dSW$hrke)nZ+^mMthn|v02kJ0$=8n;z}m6QZ0kN z+7EP0g9&s_r>!aLHPd85&p_IivXx9D^c+gtQ}&Wk{w}Mp9bP-Oc1EyfsaIyQT&$$n zN`9&FX)0lX@o7(k-zAqEE|hXq&LO6!Xm}j&I;i1+cbVa_uN=CtB;#;I=EJcVPxLiE zixUfb7#~cGJtj8&3FBJvmlPDtqeKV+&`Msa2Jk_G-Aw>dNw^09Xw}6DGuCi-9J)mWal29~nFhAm}V<6?~NR(s}h{o|Wn#orUQ-M`q)G=$44?pn9gH zYZ{B}l?(~t6Q2f5!kyIrf6z3?HQ(^YNyRr@ERNwk=d%t~Y8YSLD)+FZJ<|cw=sZHf z>ZInW{pf`cU)XSFYI2^#s^_pm%fm+ht?4}CoWm5XUeKD`Z>{7J>xB*g>#Wuu$V}#u zr4Bj(tdD8k!Oa<<+6Wy0Hk_Kj{ip69yR(yaxPTc@X_*z z%Nq?JytnqA;^=t%z})~OtV1QB`3C>FYb*ToLw61-zCltP)y2>jLx!^W^!x7b529ZW zx<>u%ug}(v*0W!D8K{3zZv|MPEXb))WwFG4M`iJ6`_OY}4hgo5CC$pFk4ybcQ(;sJ z4TyC3N8p^KIZy|zU@ys6mR#efo)ia4!&D=JMR5F0Sr8YSC2a!H$ha52KurF0g4$kZMX(yNk*Eg=T z2bWbVxhqz3SAG$!w3PNZQOcGjYUrsa6Pw9+M9CiaepAh6?}W%8E+*Ig74{@ zQi%ble+A-{&JufID*O!EYki@%dj9KZEosCiDZ5bjtrFm>rm*%0keq$rI?rA`Z=G42 zOo0;s{1f9*d}y3i194WpOpHSDN!3)&u86a0-mRhL4&G(vu3@E{#98vZ$b-|Hj6kR_ z?j34jB4#C%7qn0dMvWG&JrZDig$;#5m|QO1b~%9;!ZC<`tkJk2E>T+r;g7r=6N4Nc zA=HKmqCJv`B^SW2wl9(xLA>#ZsK!UcFJ56U_n*h-2`Vv;E1e}9TM-19?2{PaS=Lr} zigAb=F@z}vwn+$Bu%5)Z$L9d(Ts#aYWFg8Jex7u)3(>f4#o_pzh)(n*4by+Xx;x zfK}($9^^let>5g(H4muG19@b1`Y8BdeD(aQta%#OW2&d?zNcr`)1$Stg7wZgvj?+7 z*@GG1zm4v2pF98M^1oibJGDDJp$t!K9p5^B?`qEfL#=P%&LyQ~Y@;F5nh9(Kb|&(6 zuCE5%hQ69VA3z4RP`Utqekb@p9uJ10{BQiY-xSqZ`-(# zx%7!I<0H{UGLz1=9fjaVYulf9T3dlMfYnQ1*CIeX1`%}kX3r;yOrj9{tWG>=?BHxsdA|Ry&7tZO7HN<1i37n0@V&_cHGR?Ye#n*2nV} zuCZS`82s6#f4sDHW5=F5_`FKW1>)N9KlT0Cx8>aK+8*5Qx>NI;p}Wh!8u@ZWx$?^H z=qt+TD?8mg-O7*N%pIA@`QK9gZz;4qe$duUydzk`00t>*&L2|!A;l=fP3i|<=3#B? z#woS7`+jZTZf)PzYq{DJYV8SymWOa?^QtFs-*af!b11u#^9-t>Lf{nsW-5Pstq7MCrKVN(CnC-U% z3^aawEHGa0{9UaD>c88^kb3?1pnk8;HF1i)*H%9<%-#zzq&{o~crij0it!N7g8E)x zLgi!dhQJd7kJorosFcU7)Tm(ZCQMJrf&(cAyb#kWUq`qJ!mjd_43a2Sl>-BtbSPz! z`11KF=50y~H~tw^sC5W3;p6ZDE+YRILDll5%mg1v#N}`_9*xiO(xMoFpXMiL z`71an_VKh0Nx5fpMhp-keTI+D@)seERW?IhBGDKbH5Z|kmuX_UjB%36CdSKLf`_e$ zMqp8SJ}ehFpr$T@qhJK0VM+!J(HI<4!6wjPiWr8VtrT79-inXX>*=?^hC+tiNC4qW zgvJX1B;pg3eKZm~O|!s82^BJ71@-+Z;3dd}zXos<<^8B_zrr>DSAF}&ROZcGeV%b(Eu8Bn-xYDkflQ$eWO~G8IKgg(TzD<-#Nl zPMx=BrbDQN!WPdImn#V9TQKKu;U_f#cT_;VH8(FnwzRI{7V!wI0SZ<*&0W7P-&$7O zJQS_D?bb3$SimR%$Xe95LAr8DsXq)w3v_0d?g#pJ1O3~Mog=xxs2UhuAH6l9HMDH} z@b50IUw{Op4ee{Z)tW~RYcB<>R;}KD>vSI3ozMYbbyV}WWt@3r^FjxJbxv#NGcOS@ z0y+RdZlpOtL|?Qor$~i zx!#x5-j}lMrbFYqx9a}Em9+v>ZbF(R3$lcbeaKUH^F^vPg3QHWjc%VI*`%2n$XH2` z;Ue5a5a9Ec_cG z852kFEHuegk)8kmcZFf{77N2cItYLbF`k=l@~5FrW$&V)7NxS}Z7s|sla1sNd2b2Z zLp$vD$vg2v`x)?LBQum81xJz@+A6$gy$lTWGZRd9Dv!u}>&kZ5wzPfZ&YOkyGoZjU TFmPikLFfQemq$+nx;Os|=CLU} literal 0 HcmV?d00001 diff --git a/extended_document_utils.pyc b/extended_document_utils.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c728e27142ceb1185cd66353a0786752b7b746f GIT binary patch literal 6662 zcmc&2TTC3+b!KO0*(YGLycax}hj$kY*w|pO&BK6g0=8o>tpm2T8Sf0RW*>BBFpumy zu9`?RidyohN)t!fMs7kIt09%6;>uA;lS;1YC^M^#7>(o#sUMZ4eyo*7a{SYtJIgM+ zjvG;{X=m@A`#R^|bIv{I-h1vl7K;&s@9we7WBsKV_E%I%ewOvZ7nh;%Q;fleF$QNe zKHRSv*7%8G!ml0H`pIDuN9}}9=g%6>g0hy?GbC$ZbgYKev)U0&adJd#M8{-3ryVvj zIVzSBFiot9F~k2+*ha?k964-evR#-XcecgYJ1wvQhGi%w=$Z7h0fCwnygsksWqE2O z$Wf<*97FXa`&=w{k>%=*chU3?t)lJmdIUu~;N=B{=nBjz{x9Cb@t|ladC*vY<@B|BJuis)~?$u&^eixd8RV7fHuY zHew;{ZNPXpg~jU`g3*Ss*D;2CgYe=m%zz1K^!4vy{4f#18GKyr<%>eNU`S$OSctel zCTp3qjH~Tj8O)^%;V@U$=I8)Z{|1S;N*_CfZ!BzB8^S}{3%caIP)>2rnbT_*^hulm z&KS~rxNYgR4ZKL2Y)M)SV~m-y!DwJeSF|py5)-T&cvToz9wtFzi%_`H8X_~PU&r3i zcyW(r2xBy1@)GG<&!_MyY?>Itrf^3#t%Kz?Cs>bn#Ot9_ zHbZ#>(5~8J(pKwLQWbP@qkNlMLZc;3u8n#U4383mlrY9-(xz}z6C{kXjp{?TNd-q# znp7_u1o-1z(d+VNK87wP<~dR8Csy# zQ8;AO#}$(0xFGkykaQpFK@La*&KL*^Fh4ljY;j-UBJJ}sX+x^+)JDzb)NlCe7|m0& z<_#piUeO&7a(>#UWF2K`#v2$t3@%J2Tv3aZuaLYj<6{-0mv@gPP-e>y(_SC!445v;o==v+x4V2|Ut{V(6Xr zuoFmO3c(AUL&qVT=8&0k`G|yoB~=Wm(G;BJk!PUArlh#qVnKllB!;Nan`us)>U*we#?5FR#$G`u}z zh@^J+q|hjT8~*R68UWw8DLf;ChfM}JTeurAa_W3Y4fOSq&5R*z$&kmKxu5r7=U!H2 z$PVX(aze&&^%Umofcxh|Ih)~XbG#?U<<5k3jB(>MeDsMG$_izt>yyzirsvZxM*`YA z*1vSTOvxD2xN2*hhB3pLXPE^GCi@DJiG^dpQO>kxI&FL=cNQ$uLwGa?5RE=;dGVFN z3!rgRa@O=bWDJ?!2CcoTY7X^;b3?hzmUW5ofPa%R=cdc@|CKqEo1x=}ZV*V5VbeVR zVlVuEkYQcq1#?n2nR-^eFuY%+kXQ8_u>C3{O^WzY79}H^h69(V~8R+ zcMF7yjn+_6=3bByLlj*qQez0n(_)B1+LeqU4#RDNqJ2Ip=2Id2JPh801q+v zMRt9F6!FIOD1%rXlTte=baw)=1HPI6AUav+lVkZwpTMUDBplkMIkfF^Xdj&|btNQH zRq3GvC@eY8QmlVMm~oH_`MpV&o8hVuKDrSqrnI$COhQngeQr;1G9U~d=InrQmqDWv z_4V}%<{;DyzoMTABIzL($npdOXeQ2qP|;QI?myw1)eX|;eJnMrtD{Z@r>I#&9n~H5 zP5J|rgHZIopod1}TpbW_JJAHC8&S;0lBfYEgfLsw5Q8f~*eIY)#4iRm01qM@EwCKA zhN$6-Li&PJEC+GY1iXJDGXXZ0arYdUk$V#1vuotiUCQl za>n&T-HFHo@HPU$Rmo!pS8Jw16qoE%1CnMjr$yAa#I>V1*_9b_t>st9B~>VumdT~I zxJj1>VGe+~tS`4>MHP{}tL0K*W2~?-jyVi1_YnBuk-4rpVXeq^#W2?=m(G6=e$8KcdV8-M9Ukb@?Ej=UGpXg!-`Ah`r?{$ON)$X%C@wu zWzh7>wu<@QJI+mR-mHJEezA1PE0uJ{N;>De<-(GM z^DocL&&Wk(Yu3_*V==39)w(Nc-6hx7$yGb#${kl*7Y7$Ti-T7?KIB*SeA0U3(#=ca zldfooOYCqhu}iEtbVjN?3k)_D07p#)z+iqMFqjWZkAM-&w=EnKtxiZV{30z`8xl3i z>XfO>#p;dzS_|q&@Fnwn)iR|g+Kg1)8mn%NW6mw88^M>3%^z79L~Gk~{ef9f7 z?w6bgW6pyM{h;CMngs)xRdF{iBcighZhS4BDniQkoe}a*bH@rPH6M;O9|oJM0OP-q`|%R^;QC^xdp#bP51>DG=!E}r!y3UXtU+|YDwX4$%2xoo}m?59n)gxfQJ zYQNnso^?n2-D1CcrFx}WWJjfjG2p#pJFwcZ9XPM31kNkqDF^{0R$*VXh*UEqn0L8C zqV^|hV1ja8!v~d1oa4!4)9g`L9 zb6x5yOcJV_UdIRg@~-zUUcV?7cc7HhDe50i$yb;z^%VvkBELRfccK*gtkn1dya$iu#W9pG@ris{Sn0l%w{rC6r9}M^um^tz$B}B)Y=KxW nEH