2025-03-24 11:19:28 +08:00
|
|
|
|
#
|
|
|
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
#
|
|
|
|
|
|
|
|
|
|
from docx import Document
|
|
|
|
|
import re
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from collections import Counter
|
|
|
|
|
from rag.nlp import rag_tokenizer
|
|
|
|
|
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RAGFlowDocxParser:
    """Parser for Word (.docx) documents.

    Calling an instance on a document returns:

    1. the text and style name of every paragraph, with text limited to a
       page range;
    2. every table of the document, rendered as text in which each data
       cell is prefixed by the header cell(s) of its column.
    """

    # Ordered (compiled pattern, type tag) pairs used to classify a table
    # cell. Order matters: the first pattern that matches wins. Compiled once
    # here instead of being rebuilt for every classified cell.
    _CELL_PATTERNS = tuple(
        (re.compile(pattern), tag)
        for pattern, tag in (
            ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^(20|19)[0-9]{2}年$", "Dt"),
            (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
            ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^第*[一二三四1-4]季度$", "Dt"),
            (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
            # Previously tagged "DT", which split date cells into two
            # Counter buckets and could let "Nu" win on date-heavy tables.
            (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
            ("^[0-9.,+%/ -]+$", "Nu"),
            (r"^[0-9A-Z/\._~-]+$", "Ca"),
            (r"^[A-Z]*[a-z' -]+$", "En"),
            # NOTE(review): "()" appears twice in this character class; the
            # second pair was possibly meant to be full-width parentheses --
            # confirm against the upstream source.
            (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
            (r"^.{1}$", "Sg"),
        )
    )

    def __extract_table_content(self, tb):
        """Render one table as text.

        Args:
            tb: Table object whose ``rows`` yield objects with a ``cells``
                sequence, each cell exposing ``text`` (a python-docx table).

        Returns:
            The list of strings produced by ``__compose_table_content`` for
            the table's cell texts.
        """
        rows = [[cell.text for cell in row.cells] for row in tb.rows]
        return self.__compose_table_content(pd.DataFrame(rows))

    @staticmethod
    def _block_type(b):
        """Classify the text of a single table cell.

        The regular expressions in ``_CELL_PATTERNS`` are tried in order and
        the tag of the first match is returned:

        - ``Dt``: date-like (year, year-month, month-day, quarter, ...)
        - ``Nu``: digits and numeric punctuation only
        - ``Ca``: digits, upper-case letters and ``/ . _ ~ -`` (code/ID-like)
        - ``En``: English words
        - ``NE``: a number followed by letters/symbols
        - ``Sg``: exactly one character

        When no pattern matches, the text is tokenized with ``rag_tokenizer``
        and tokens longer than one character are counted:

        - ``Tx``: 4 to 11 such tokens (short text)
        - ``Lx``: 12 or more such tokens (long text)
        - ``Nr``: exactly one such token, tagged ``"nr"`` by the tokenizer
        - ``Ot``: anything else

        Args:
            b: Cell text.

        Returns:
            The type tag as a string.
        """
        for pattern, tag in RAGFlowDocxParser._CELL_PATTERNS:
            if pattern.search(b):
                return tag

        tokens = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
        if len(tokens) > 3:
            return "Tx" if len(tokens) < 12 else "Lx"
        if len(tokens) == 1 and rag_tokenizer.tag(tokens[0]) == "nr":
            return "Nr"
        return "Ot"

    def __compose_table_content(self, df):
        """Turn a table into text lines of ``header: value`` pairs.

        Row 0 is always treated as a header row. When the most common cell
        type among the remaining rows is numeric (``"Nu"``), every further row
        whose own most common cell type is not numeric is treated as a header
        row as well. Each data row is rendered as ``header: value`` pairs
        joined by ``;``, where the header of a column is built from the
        closest run of consecutive header rows above the data row. Empty
        cells are skipped.

        Args:
            df: ``pandas.DataFrame`` holding the cell texts, one table row per
                frame row.

        Returns:
            ``[]`` when the table has fewer than two rows or no columns.
            Otherwise, one string per data row when the table has more than
            three columns, or a single-element list holding all data rows
            joined by newlines when it has three columns or fewer.
        """
        row_count = len(df)
        # A table needs a header row plus at least one data row.
        if row_count < 2:
            return []

        colnm = df.shape[1]
        # Without columns there is nothing to classify; bail out instead of
        # letting max() fail on an empty Counter below.
        if colnm == 0:
            return []

        # Classify every non-first-row cell exactly once; the result feeds
        # both the table-wide statistics and the per-row header detection.
        cell_types = [
            [self._block_type(str(df.iloc[i, j])) for j in range(colnm)]
            for i in range(1, row_count)
        ]

        # Most common cell type of the table (first row excluded). Ties are
        # resolved in favour of the type encountered first.
        type_counts = Counter(t for row in cell_types for t in row)
        max_type = max(type_counts.items(), key=lambda x: x[1])[0]

        # The first row is a header by default; headers may also show up
        # further down in mostly-numeric tables.
        hdrows = [0]
        if max_type == "Nu":
            for r in range(1, row_count):
                row_counts = Counter(cell_types[r - 1])
                dominant = max(row_counts.items(), key=lambda x: x[1])[0]
                if dominant != max_type:
                    hdrows.append(r)

        lines = []
        for i in range(1, row_count):
            if i in hdrows:
                continue

            # Offsets (negative) of the header rows located above row i.
            hr = [r - i for r in hdrows if r < i]

            # Keep only the closest block of consecutive header rows: walk
            # upwards and cut at the first gap.
            t = len(hr) - 1
            while t > 0:
                if hr[t] - hr[t - 1] > 1:
                    hr = hr[t:]
                    break
                t -= 1

            # Column headers: distinct header-cell texts joined by commas.
            headers = []
            for j in range(colnm):
                parts = []
                for h in hr:
                    text = str(df.iloc[i + h, j]).strip()
                    if text not in parts:
                        parts.append(text)
                label = ",".join(parts)
                if label:
                    label += ": "
                headers.append(label)

            # Row text: "header: value" for every non-empty cell.
            cells = []
            for j in range(colnm):
                value = str(df.iloc[i, j])
                if value:
                    cells.append(headers[j] + value)
            lines.append(";".join(cells))

        # Tables with more than three columns yield one entry per row;
        # narrower tables are returned as a single block of text.
        if colnm > 3:
            return lines
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000000):
        """Parse a Word document into paragraph sections and table texts.

        Pages are counted by looking for ``lastRenderedPageBreak`` in the XML
        of each run, starting from page 0.

        Args:
            fnm: Path of the document as ``str``; any other value is wrapped
                in ``BytesIO`` and treated as the raw file content.
            from_page: First page (0-based, inclusive) whose text is kept.
            to_page: Page at which text collection stops (exclusive).

        Returns:
            A tuple ``(secs, tbls)`` where

            - ``secs`` is a list of ``(text, style_name)`` tuples, one per
              visited paragraph; ``text`` only contains the runs that lie in
              the page range, so it is empty for paragraphs outside of it;
            - ``tbls`` holds one list of strings per table of the document.
              Tables are not filtered by page range.
        """
        # Build the Document from a path or from in-memory bytes.
        self.doc = Document(fnm) if isinstance(fnm, str) else Document(BytesIO(fnm))
        pn = 0  # page currently being parsed
        secs = []  # parsed paragraphs

        for p in self.doc.paragraphs:
            # Stop once the page range has been left behind.
            if pn > to_page:
                break

            # The paragraph text does not change while its runs are walked.
            has_text = bool(p.text.strip())
            runs_within_single_paragraph = []  # runs inside the page range
            for run in p.runs:
                if pn > to_page:
                    break
                # Collect the run first, so that a run carrying a page break
                # is still attributed to the page it was reached on.
                if from_page <= pn < to_page and has_text:
                    runs_within_single_paragraph.append(run.text)

                # A rendered page break inside the run starts a new page.
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1

            # One entry per paragraph: collected text plus style name.
            style_name = p.style.name if hasattr(p.style, 'name') else ''
            secs.append(("".join(runs_within_single_paragraph), style_name))

        # Tables are extracted for the whole document.
        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
|