Merge pull request #147 from zstar1003/dev

refactor(agent): remove deepdoc-related code to keep it from misleading deepwiki
zstar 2025-06-05 10:14:36 +08:00 committed by GitHub
commit 1fbb0d58c0
60 changed files with 285 additions and 66406 deletions

View File

@ -7,7 +7,6 @@ WORKDIR /ragflow
# Copy the Python-related code directories
COPY api ./api
COPY conf ./conf
COPY deepdoc ./deepdoc
COPY rag ./rag
COPY agent ./agent
COPY graphrag ./graphrag

View File

@ -17,7 +17,6 @@ import json
import re
from abc import ABC
import requests
from deepdoc.parser import HtmlParser
from agent.component.base import ComponentBase, ComponentParamBase
@ -38,11 +37,11 @@ class InvokeParam(ComponentParamBase):
self.datatype = "json" # New parameter to determine data posting type
def check(self):
self.check_valid_value(self.method.lower(), "Type of content from the crawler", ['get', 'post', 'put'])
self.check_valid_value(self.method.lower(), "Type of content from the crawler", ["get", "post", "put"])
self.check_empty(self.url, "End point URL")
self.check_positive_integer(self.timeout, "Timeout time in second")
self.check_boolean(self.clean_html, "Clean HTML")
self.check_valid_value(self.datatype.lower(), "Data post type", ['json', 'formdata']) # Check for valid datapost value
self.check_valid_value(self.datatype.lower(), "Data post type", ["json", "formdata"]) # Check for valid datapost value
class Invoke(ComponentBase, ABC):
@ -52,9 +51,9 @@ class Invoke(ComponentBase, ABC):
args = {}
for para in self._param.variables:
if para.get("component_id"):
if '@' in para["component_id"]:
component = para["component_id"].split('@')[0]
field = para["component_id"].split('@')[1]
if "@" in para["component_id"]:
component = para["component_id"].split("@")[0]
field = para["component_id"].split("@")[1]
cpn = self._canvas.get_component(component)["obj"]
for param in cpn._param.query:
if param["key"] == field:
@ -83,50 +82,27 @@ class Invoke(ComponentBase, ABC):
if re.sub(r"https?:?/?/?", "", self._param.proxy):
proxies = {"http": self._param.proxy, "https": self._param.proxy}
if method == 'get':
response = requests.get(url=url,
params=args,
headers=headers,
proxies=proxies,
timeout=self._param.timeout)
if method == "get":
response = requests.get(url=url, params=args, headers=headers, proxies=proxies, timeout=self._param.timeout)
if self._param.clean_html:
sections = HtmlParser()(None, response.content)
return Invoke.be_output("\n".join(sections))
return Invoke.be_output("\n")
return Invoke.be_output(response.text)
if method == 'put':
if self._param.datatype.lower() == 'json':
response = requests.put(url=url,
json=args,
headers=headers,
proxies=proxies,
timeout=self._param.timeout)
if method == "put":
if self._param.datatype.lower() == "json":
response = requests.put(url=url, json=args, headers=headers, proxies=proxies, timeout=self._param.timeout)
else:
response = requests.put(url=url,
data=args,
headers=headers,
proxies=proxies,
timeout=self._param.timeout)
response = requests.put(url=url, data=args, headers=headers, proxies=proxies, timeout=self._param.timeout)
if self._param.clean_html:
sections = HtmlParser()(None, response.content)
return Invoke.be_output("\n".join(sections))
return Invoke.be_output("\n")
return Invoke.be_output(response.text)
if method == 'post':
if self._param.datatype.lower() == 'json':
response = requests.post(url=url,
json=args,
headers=headers,
proxies=proxies,
timeout=self._param.timeout)
if method == "post":
if self._param.datatype.lower() == "json":
response = requests.post(url=url, json=args, headers=headers, proxies=proxies, timeout=self._param.timeout)
else:
response = requests.post(url=url,
data=args,
headers=headers,
proxies=proxies,
timeout=self._param.timeout)
response = requests.post(url=url, data=args, headers=headers, proxies=proxies, timeout=self._param.timeout)
if self._param.clean_html:
sections = HtmlParser()(None, response.content)
return Invoke.be_output("\n".join(sections))
return Invoke.be_output("\n")
return Invoke.be_output(response.text)
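With `HtmlParser` gone from this component, the `clean_html` branches above now return only a bare newline. Purely as an illustration (not part of this commit), HTML could still be reduced to plain text with the standard library; the helper below is a hypothetical sketch.

```python
# Hypothetical fallback, not part of this commit: reduce HTML to plain text with
# the standard library so a clean_html option could still return readable output.
from html.parser import HTMLParser


class _TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.parts = []

    def handle_data(self, data):
        text = data.strip()
        if text:
            self.parts.append(text)


def strip_html(raw: str) -> str:
    extractor = _TextExtractor()
    extractor.feed(raw)
    return "\n".join(extractor.parts)


if __name__ == "__main__":
    print(strip_html("<html><body><h1>Title</h1><p>Hello, world.</p></body></html>"))
```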

View File

@ -22,7 +22,6 @@ import flask
from flask import request
from flask_login import login_required, current_user
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search
from api.db import FileType, TaskStatus, ParserType, FileSource
@ -49,50 +48,44 @@ from api.utils.web_utils import html2pdf, is_valid_url
from api.constants import IMG_BASE64_PREFIX
@manager.route('/upload', methods=['POST']) # noqa: F821
@manager.route("/upload", methods=["POST"]) # noqa: F821
@login_required
@validate_request("kb_id")
def upload():
kb_id = request.form.get("kb_id")
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
if 'file' not in request.files:
return get_json_result(
data=False, message='No file part!', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
if "file" not in request.files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)
file_objs = request.files.getlist('file')
file_objs = request.files.getlist("file")
for file_obj in file_objs:
if file_obj.filename == '':
return get_json_result(
data=False, message='No file selected!', code=settings.RetCode.ARGUMENT_ERROR)
if file_obj.filename == "":
return get_json_result(data=False, message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR)
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
raise LookupError("Can't find this knowledgebase!")
err, files = FileService.upload_document(kb, file_objs, current_user.id)
files = [f[0] for f in files] # remove the blob
files = [f[0] for f in files] # remove the blob
if err:
return get_json_result(
data=files, message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
return get_json_result(data=files, message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
return get_json_result(data=files)
@manager.route('/web_crawl', methods=['POST']) # noqa: F821
@manager.route("/web_crawl", methods=["POST"]) # noqa: F821
@login_required
@validate_request("kb_id", "name", "url")
def web_crawl():
kb_id = request.form.get("kb_id")
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
name = request.form.get("name")
url = request.form.get("url")
if not is_valid_url(url):
return get_json_result(
data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message="The URL format is invalid", code=settings.RetCode.ARGUMENT_ERROR)
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
raise LookupError("Can't find this knowledgebase!")
@ -108,10 +101,7 @@ def web_crawl():
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
try:
filename = duplicate_name(
DocumentService.query,
name=name + ".pdf",
kb_id=kb.id)
filename = duplicate_name(DocumentService.query, name=name + ".pdf", kb_id=kb.id)
filetype = filename_type(filename)
if filetype == FileType.OTHER.value:
raise RuntimeError("This type of file has not been supported yet!")
@ -130,7 +120,7 @@ def web_crawl():
"name": filename,
"location": location,
"size": len(blob),
"thumbnail": thumbnail(filename, blob)
"thumbnail": thumbnail(filename, blob),
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
@ -147,58 +137,53 @@ def web_crawl():
return get_json_result(data=True)
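For reference, a hedged sketch of calling the `web_crawl` endpoint above from a client. It assumes the blueprint is mounted under `/v1/document` (consistent with the `/v1/document/image/...` thumbnail URLs used elsewhere in this file), that the server runs on its default address, and that `auth_headers` carries a valid login; adjust these to your deployment.

```python
# Hypothetical client call to /web_crawl; path prefix, port and auth are assumptions.
import requests

BASE_URL = "http://localhost:9380"                       # assumed API address
auth_headers = {"Authorization": "Bearer <your-token>"}  # placeholder credentials

resp = requests.post(
    f"{BASE_URL}/v1/document/web_crawl",
    headers=auth_headers,
    data={
        "kb_id": "<knowledgebase-id>",
        "name": "example-page",         # stored as example-page.pdf
        "url": "https://example.com/",  # must pass is_valid_url()
    },
    timeout=60,
)
print(resp.status_code, resp.json())
```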
@manager.route('/create', methods=['POST']) # noqa: F821
@manager.route("/create", methods=["POST"]) # noqa: F821
@login_required
@validate_request("name", "kb_id")
def create():
req = request.json
kb_id = req["kb_id"]
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
try:
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
return get_data_error_result(
message="Can't find this knowledgebase!")
return get_data_error_result(message="Can't find this knowledgebase!")
if DocumentService.query(name=req["name"], kb_id=kb_id):
return get_data_error_result(
message="Duplicated document name in the same knowledgebase.")
return get_data_error_result(message="Duplicated document name in the same knowledgebase.")
doc = DocumentService.insert({
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": FileType.VIRTUAL,
"name": req["name"],
"location": "",
"size": 0
})
doc = DocumentService.insert(
{
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": FileType.VIRTUAL,
"name": req["name"],
"location": "",
"size": 0,
}
)
return get_json_result(data=doc.to_json())
except Exception as e:
return server_error_response(e)
@manager.route('/list', methods=['GET']) # noqa: F821
@manager.route("/list", methods=["GET"]) # noqa: F821
@login_required
def list_docs():
kb_id = request.args.get("kb_id")
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
tenants = UserTenantService.query(user_id=current_user.id)
for tenant in tenants:
if KnowledgebaseService.query(
tenant_id=tenant.tenant_id, id=kb_id):
if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id):
break
else:
return get_json_result(
data=False, message='Only owner of knowledgebase authorized for this operation.',
code=settings.RetCode.OPERATING_ERROR)
return get_json_result(data=False, message="Only owner of knowledgebase authorized for this operation.", code=settings.RetCode.OPERATING_ERROR)
keywords = request.args.get("keywords", "")
page_number = int(request.args.get("page", 1))
@ -206,70 +191,58 @@ def list_docs():
orderby = request.args.get("orderby", "create_time")
desc = request.args.get("desc", True)
try:
docs, tol = DocumentService.get_by_kb_id(
kb_id, page_number, items_per_page, orderby, desc, keywords)
docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords)
for doc_item in docs:
if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
doc_item['thumbnail'] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
return get_json_result(data={"total": tol, "docs": docs})
except Exception as e:
return server_error_response(e)
@manager.route('/infos', methods=['POST']) # noqa: F821
@manager.route("/infos", methods=["POST"]) # noqa: F821
@login_required
def docinfos():
req = request.json
doc_ids = req["doc_ids"]
for doc_id in doc_ids:
if not DocumentService.accessible(doc_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
docs = DocumentService.get_by_ids(doc_ids)
return get_json_result(data=list(docs.dicts()))
@manager.route('/thumbnails', methods=['GET']) # noqa: F821
@manager.route("/thumbnails", methods=["GET"]) # noqa: F821
# @login_required
def thumbnails():
doc_ids = request.args.get("doc_ids").split(",")
if not doc_ids:
return get_json_result(
data=False, message='Lack of "Document ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "Document ID"', code=settings.RetCode.ARGUMENT_ERROR)
try:
docs = DocumentService.get_thumbnails(doc_ids)
for doc_item in docs:
if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
doc_item['thumbnail'] = f"/v1/document/image/{doc_item['kb_id']}-{doc_item['thumbnail']}"
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/v1/document/image/{doc_item['kb_id']}-{doc_item['thumbnail']}"
return get_json_result(data={d["id"]: d["thumbnail"] for d in docs})
except Exception as e:
return server_error_response(e)
@manager.route('/change_status', methods=['POST']) # noqa: F821
@manager.route("/change_status", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "status")
def change_status():
req = request.json
if str(req["status"]) not in ["0", "1"]:
return get_json_result(
data=False,
message='"Status" must be either 0 or 1!',
code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='"Status" must be either 0 or 1!', code=settings.RetCode.ARGUMENT_ERROR)
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
@ -277,23 +250,19 @@ def change_status():
return get_data_error_result(message="Document not found!")
e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
if not e:
return get_data_error_result(
message="Can't find this knowledgebase!")
return get_data_error_result(message="Can't find this knowledgebase!")
if not DocumentService.update_by_id(
req["doc_id"], {"status": str(req["status"])}):
return get_data_error_result(
message="Database error (Document update)!")
if not DocumentService.update_by_id(req["doc_id"], {"status": str(req["status"])}):
return get_data_error_result(message="Database error (Document update)!")
status = int(req["status"])
settings.docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status},
search.index_name(kb.tenant_id), doc.kb_id)
settings.docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
@manager.route('/rm', methods=['POST']) # noqa: F821
@manager.route("/rm", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id")
def rm():
@ -304,11 +273,7 @@ def rm():
for doc_id in doc_ids:
if not DocumentService.accessible4deletion(doc_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
root_folder = FileService.get_root_folder(current_user.id)
pf_id = root_folder["id"]
@ -327,8 +292,7 @@ def rm():
TaskService.filter_delete([Task.doc_id == doc_id])
if not DocumentService.remove_document(doc, tenant_id):
return get_data_error_result(
message="Database error (Document removal)!")
return get_data_error_result(message="Database error (Document removal)!")
f2d = File2DocumentService.get_by_document_id(doc_id)
FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
@ -344,20 +308,16 @@ def rm():
return get_json_result(data=True)
@manager.route('/run', methods=['POST']) # noqa: F821
@manager.route("/run", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_ids", "run")
def run():
req = request.json
# Check that the user is authorized to operate on these documents
for doc_id in req["doc_ids"]:
if not DocumentService.accessible(doc_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
# Iterate over all document IDs that need processing
for id in req["doc_ids"]:
@ -368,7 +328,7 @@ def run():
info["progress_msg"] = ""
info["chunk_num"] = 0
info["token_num"] = 0
# Update the document status
DocumentService.update_by_id(id, info)
# Get the tenant ID
@ -386,7 +346,7 @@ def run():
# If the index exists, delete the document's data from it
if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)
# If the requested status is RUNNING, create parsing tasks
if str(req["run"]) == TaskStatus.RUNNING.value:
e, doc = DocumentService.get_by_id(id)
@ -402,36 +362,25 @@ def run():
return server_error_response(e)
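The `run` endpoint resets a document's progress, removes its existing chunks from the document store, and queues new parsing tasks when the requested status is RUNNING. A hedged client sketch follows; the `/v1/document` prefix, the port, and the `"1"` value for `TaskStatus.RUNNING.value` are assumptions to verify against your deployment and `api/db`.

```python
# Hypothetical client call to trigger re-parsing via /run; prefix, port, auth
# and the "1" run value (assumed to be TaskStatus.RUNNING.value) are assumptions.
import requests

BASE_URL = "http://localhost:9380"                       # assumed API address
auth_headers = {"Authorization": "Bearer <your-token>"}  # placeholder credentials

resp = requests.post(
    f"{BASE_URL}/v1/document/run",
    headers=auth_headers,
    json={"doc_ids": ["<doc-id-1>", "<doc-id-2>"], "run": "1"},
    timeout=30,
)
print(resp.json())
```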
@manager.route('/rename', methods=['POST']) # noqa: F821
@manager.route("/rename", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "name")
def rename():
req = request.json
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
doc.name.lower()).suffix:
return get_json_result(
data=False,
message="The extension of file can't be changed",
code=settings.RetCode.ARGUMENT_ERROR)
if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
return get_json_result(data=False, message="The extension of file can't be changed", code=settings.RetCode.ARGUMENT_ERROR)
for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
if d.name == req["name"]:
return get_data_error_result(
message="Duplicated document name in the same knowledgebase.")
return get_data_error_result(message="Duplicated document name in the same knowledgebase.")
if not DocumentService.update_by_id(
req["doc_id"], {"name": req["name"]}):
return get_data_error_result(
message="Database error (Document rename)!")
if not DocumentService.update_by_id(req["doc_id"], {"name": req["name"]}):
return get_data_error_result(message="Database error (Document rename)!")
informs = File2DocumentService.get_by_document_id(req["doc_id"])
if informs:
@ -443,7 +392,7 @@ def rename():
return server_error_response(e)
@manager.route('/get/<doc_id>', methods=['GET']) # noqa: F821
@manager.route("/get/<doc_id>", methods=["GET"]) # noqa: F821
# @login_required
def get(doc_id):
try:
@ -457,29 +406,22 @@ def get(doc_id):
ext = re.search(r"\.([^.]+)$", doc.name)
if ext:
if doc.type == FileType.VISUAL.value:
response.headers.set('Content-Type', 'image/%s' % ext.group(1))
response.headers.set("Content-Type", "image/%s" % ext.group(1))
else:
response.headers.set(
'Content-Type',
'application/%s' %
ext.group(1))
response.headers.set("Content-Type", "application/%s" % ext.group(1))
return response
except Exception as e:
return server_error_response(e)
@manager.route('/change_parser', methods=['POST']) # noqa: F821
@manager.route("/change_parser", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "parser_id")
def change_parser():
req = request.json
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
@ -491,21 +433,16 @@ def change_parser():
else:
return get_json_result(data=True)
if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture")
or (re.search(
r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")):
if (doc.type == FileType.VISUAL and req["parser_id"] != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation"):
return get_data_error_result(message="Not supported yet!")
e = DocumentService.update_by_id(doc.id,
{"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
"run": TaskStatus.UNSTART.value})
e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": TaskStatus.UNSTART.value})
if not e:
return get_data_error_result(message="Document not found!")
if "parser_config" in req:
DocumentService.update_parser_config(doc.id, req["parser_config"])
if doc.token_num > 0:
e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
doc.process_duation * -1)
e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duation * -1)
if not e:
return get_data_error_result(message="Document not found!")
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
@ -519,7 +456,7 @@ def change_parser():
return server_error_response(e)
@manager.route('/image/<image_id>', methods=['GET']) # noqa: F821
@manager.route("/image/<image_id>", methods=["GET"]) # noqa: F821
# @login_required
def get_image(image_id):
try:
@ -528,60 +465,52 @@ def get_image(image_id):
return get_data_error_result(message="Image not found.")
bkt, nm = image_id.split("-")
response = flask.make_response(STORAGE_IMPL.get(bkt, nm))
response.headers.set('Content-Type', 'image/JPEG')
response.headers.set("Content-Type", "image/JPEG")
return response
except Exception as e:
return server_error_response(e)
@manager.route('/upload_and_parse', methods=['POST']) # noqa: F821
@manager.route("/upload_and_parse", methods=["POST"]) # noqa: F821
@login_required
@validate_request("conversation_id")
def upload_and_parse():
if 'file' not in request.files:
return get_json_result(
data=False, message='No file part!', code=settings.RetCode.ARGUMENT_ERROR)
if "file" not in request.files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)
file_objs = request.files.getlist('file')
file_objs = request.files.getlist("file")
for file_obj in file_objs:
if file_obj.filename == '':
return get_json_result(
data=False, message='No file selected!', code=settings.RetCode.ARGUMENT_ERROR)
if file_obj.filename == "":
return get_json_result(data=False, message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR)
doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)
return get_json_result(data=doc_ids)
@manager.route('/parse', methods=['POST']) # noqa: F821
@manager.route("/parse", methods=["POST"]) # noqa: F821
@login_required
def parse():
url = request.json.get("url") if request.json else ""
if url:
if not is_valid_url(url):
return get_json_result(
data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message="The URL format is invalid", code=settings.RetCode.ARGUMENT_ERROR)
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
os.makedirs(download_path, exist_ok=True)
from seleniumwire.webdriver import Chrome, ChromeOptions
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_experimental_option('prefs', {
'download.default_directory': download_path,
'download.prompt_for_download': False,
'download.directory_upgrade': True,
'safebrowsing.enabled': True
})
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_experimental_option("prefs", {"download.default_directory": download_path, "download.prompt_for_download": False, "download.directory_upgrade": True, "safebrowsing.enabled": True})
driver = Chrome(options=options)
driver.get(url)
res_headers = [r.response.headers for r in driver.requests if r and r.response]
if len(res_headers) > 1:
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
driver.quit()
return get_json_result(data="\n".join(sections))
return get_json_result(data="\n")
class File:
filename: str
@ -597,51 +526,41 @@ def parse():
r = re.search(r"filename=\"([^\"]+)\"", str(res_headers))
if not r or not r.group(1):
return get_json_result(
data=False, message="Can't not identify downloaded file", code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message="Can't not identify downloaded file", code=settings.RetCode.ARGUMENT_ERROR)
f = File(r.group(1), os.path.join(download_path, r.group(1)))
txt = FileService.parse_docs([f], current_user.id)
return get_json_result(data=txt)
if 'file' not in request.files:
return get_json_result(
data=False, message='No file part!', code=settings.RetCode.ARGUMENT_ERROR)
if "file" not in request.files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)
file_objs = request.files.getlist('file')
file_objs = request.files.getlist("file")
txt = FileService.parse_docs(file_objs, current_user.id)
return get_json_result(data=txt)
@manager.route('/set_meta', methods=['POST']) # noqa: F821
@manager.route("/set_meta", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "meta")
def set_meta():
req = request.json
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
meta = json.loads(req["meta"])
except Exception as e:
return get_json_result(
data=False, message=f'Json syntax error: {e}', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message=f"Json syntax error: {e}", code=settings.RetCode.ARGUMENT_ERROR)
if not isinstance(meta, dict):
return get_json_result(
data=False, message='Meta data should be in Json map format, like {"key": "value"}', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Meta data should be in Json map format, like {"key": "value"}', code=settings.RetCode.ARGUMENT_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if not DocumentService.update_by_id(
req["doc_id"], {"meta_fields": meta}):
return get_data_error_result(
message="Database error (meta updates)!")
if not DocumentService.update_by_id(req["doc_id"], {"meta_fields": meta}):
return get_data_error_result(message="Database error (meta updates)!")
return get_json_result(data=True)
except Exception as e:

View File

@ -19,7 +19,6 @@ import xxhash
from datetime import datetime
from api.db.db_utils import bulk_insert_into_db
from deepdoc.parser import PdfParser
from peewee import JOIN
from api.db.db_models import DB, File2Document, File
from api.db import StatusEnum, FileType, TaskStatus
@ -27,7 +26,6 @@ from api.db.db_models import Task, Document, Knowledgebase, Tenant
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.utils import current_timestamp, get_uuid
from deepdoc.parser.excel_parser import RAGFlowExcelParser
from rag.settings import SVR_QUEUE_NAME
from rag.utils.storage_factory import STORAGE_IMPL
from rag.utils.redis_conn import REDIS_CONN
@ -40,8 +38,8 @@ def trim_header_by_lines(text: str, max_length) -> str:
if len_text <= max_length:
return text
for i in range(len_text):
if text[i] == '\n' and len_text - i <= max_length:
return text[i + 1:]
if text[i] == "\n" and len_text - i <= max_length:
return text[i + 1 :]
return text
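To make the trimming rule concrete: whole leading lines are dropped until what remains fits within `max_length`, and if no newline yields a short-enough tail the text is returned unchanged. A standalone sketch follows (the function is copied from above; the `len_text = len(text)` assignment is assumed from context, since it is elided from the hunk).

```python
def trim_header_by_lines(text: str, max_length) -> str:
    len_text = len(text)  # assumed assignment, elided from the hunk above
    if len_text <= max_length:
        return text
    for i in range(len_text):
        if text[i] == "\n" and len_text - i <= max_length:
            return text[i + 1 :]
    return text


text = "line1\nline2\nline3"                              # 17 characters
assert trim_header_by_lines(text, 50) == text             # already within the limit
assert trim_header_by_lines(text, 12) == "line2\nline3"   # first line dropped
assert trim_header_by_lines(text, 6) == "line3"           # only the last line fits
assert trim_header_by_lines(text, 4) == text              # no split point fits: unchanged
```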
@ -76,10 +74,10 @@ class TaskService(CommonService):
]
docs = (
cls.model.select(*fields)
.join(Document, on=(cls.model.doc_id == Document.id))
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id))
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
.where(cls.model.id == task_id)
.join(Document, on=(cls.model.doc_id == Document.id))
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id))
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
.where(cls.model.id == task_id)
)
docs = list(docs.dicts())
if not docs:
@ -112,10 +110,7 @@ class TaskService(CommonService):
cls.model.digest,
cls.model.chunk_ids,
]
tasks = (
cls.model.select(*fields).order_by(cls.model.from_page.asc(), cls.model.create_time.desc())
.where(cls.model.doc_id == doc_id)
)
tasks = cls.model.select(*fields).order_by(cls.model.from_page.asc(), cls.model.create_time.desc()).where(cls.model.doc_id == doc_id)
tasks = list(tasks.dicts())
if not tasks:
return None
@ -131,21 +126,19 @@ class TaskService(CommonService):
def get_ongoing_doc_name(cls):
with DB.lock("get_task", -1):
docs = (
cls.model.select(
*[Document.id, Document.kb_id, Document.location, File.parent_id]
)
.join(Document, on=(cls.model.doc_id == Document.id))
.join(
cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id])
.join(Document, on=(cls.model.doc_id == Document.id))
.join(
File2Document,
on=(File2Document.document_id == Document.id),
join_type=JOIN.LEFT_OUTER,
)
.join(
.join(
File,
on=(File2Document.file_id == File.id),
join_type=JOIN.LEFT_OUTER,
)
.where(
.where(
Document.status == StatusEnum.VALID.value,
Document.run == TaskStatus.RUNNING.value,
~(Document.type == FileType.VIRTUAL.value),
@ -185,9 +178,7 @@ class TaskService(CommonService):
progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 3000)
cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
if "progress" in info:
cls.model.update(progress=info["progress"]).where(
cls.model.id == id
).execute()
cls.model.update(progress=info["progress"]).where(cls.model.id == id).execute()
return
with DB.lock("update_progress", -1):
@ -196,23 +187,21 @@ class TaskService(CommonService):
progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 3000)
cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
if "progress" in info:
cls.model.update(progress=info["progress"]).where(
cls.model.id == id
).execute()
cls.model.update(progress=info["progress"]).where(cls.model.id == id).execute()
def queue_tasks(doc: dict, bucket: str, name: str):
"""
Split a document-parsing job into tasks and queue them for processing.
Based on the document type (PDF, table, etc.), the function splits the document into multiple sub-tasks, computes a digest for each task,
checks whether results from previous tasks can be reused, and pushes the unfinished tasks onto the Redis queue for processing.
Args:
doc (dict): document info, including id, type, parser_id, parser_config, etc.
bucket (str): storage bucket name
name (str): file name
Flow:
1. Split the document into sub-tasks according to its type (PDF/table)
2. Generate a unique digest for each task
@ -221,10 +210,11 @@ def queue_tasks(doc: dict, bucket: str, name: str):
5. Bulk-insert the new tasks into the database
6. Push the unfinished tasks onto the Redis queue
"""
def new_task():
"""
Create a new task dict containing the basic task information.
Returns:
dict: a task dict containing the task ID, document ID, progress and page range
"""
@ -240,7 +230,7 @@ def queue_tasks(doc: dict, bucket: str, name: str):
# Layout recognition method, defaulting to "DeepDOC"
do_layout = doc["parser_config"].get("layout_recognize", "DeepDOC")
# Total number of pages in the PDF
pages = PdfParser.total_page_number(doc["name"], file_bin)
pages = 1
# Number of pages handled per task, 12 by default
page_size = doc["parser_config"].get("task_page_size", 12)
# For the academic-paper parser, the default is 22 pages per task
@ -248,9 +238,9 @@ def queue_tasks(doc: dict, bucket: str, name: str):
page_size = doc["parser_config"].get("task_page_size", 22)
# For certain parsers, or when layout recognition is not DeepDOC, treat the whole document as a single task
if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC":
page_size = 10 ** 9
page_size = 10**9
# Page ranges to process, defaulting to all pages
page_ranges = doc["parser_config"].get("pages") or [(1, 10 ** 5)]
page_ranges = doc["parser_config"].get("pages") or [(1, 10**5)]
# Split tasks according to the page ranges and page size
for s, e in page_ranges:
# Adjust page numbers to be zero-based
@ -269,14 +259,6 @@ def queue_tasks(doc: dict, bucket: str, name: str):
elif doc["parser_id"] == "table":
# Fetch the file content from storage
file_bin = STORAGE_IMPL.get(bucket, name)
# Get the total number of rows in the spreadsheet
rn = RAGFlowExcelParser.row_number(doc["name"], file_bin)
# Every 3000 rows becomes one task
for i in range(0, rn, 3000):
task = new_task()
task["from_page"] = i
task["to_page"] = min(i + 3000, rn)
parse_task_array.append(task)
# Other document types: treat the whole document as a single task
else:
parse_task_array.append(new_task())
@ -321,8 +303,7 @@ def queue_tasks(doc: dict, bucket: str, name: str):
chunk_ids.extend(task["chunk_ids"].split())
# Delete these chunks from the document store
if chunk_ids:
settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]),
chunking_config["kb_id"])
settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
# Update the document's chunk count
DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})
@ -335,17 +316,14 @@ def queue_tasks(doc: dict, bucket: str, name: str):
unfinished_task_array = [task for task in parse_task_array if task["progress"] < 1.0]
# Push the unfinished tasks onto the Redis queue
for unfinished_task in unfinished_task_array:
assert REDIS_CONN.queue_product(
SVR_QUEUE_NAME, message=unfinished_task
), "Can't access Redis. Please check the Redis' status."
assert REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=unfinished_task), "Can't access Redis. Please check the Redis' status."
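As a concrete illustration of the page splitting described in the docstring, the sketch below reproduces the arithmetic in isolation: each configured page range (1-based, inclusive) is cut into windows of `page_size` pages, giving zero-based `from_page`/`to_page` bounds. The helper name and the exact clamping are assumptions for illustration only; the body of the splitting loop is elided from the hunks above.

```python
# Hypothetical standalone sketch of the page-range splitting; not the service code.
def split_page_ranges(page_ranges, page_size, total_pages):
    tasks = []
    for s, e in page_ranges:
        s = max(0, s - 1)            # adjust page numbers to be zero-based
        e = min(e - 1, total_pages)  # clamp to the document length
        for p in range(s, e, page_size):
            tasks.append({"from_page": p, "to_page": min(p + page_size, e)})
    return tasks


print(split_page_ranges([(1, 10**5)], page_size=12, total_pages=30))
# [{'from_page': 0, 'to_page': 12}, {'from_page': 12, 'to_page': 24}, {'from_page': 24, 'to_page': 30}]
```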
def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: dict):
idx = 0
while idx < len(prev_tasks):
prev_task = prev_tasks[idx]
if prev_task.get("from_page", 0) == task.get("from_page", 0) \
and prev_task.get("digest", 0) == task.get("digest", ""):
if prev_task.get("from_page", 0) == task.get("from_page", 0) and prev_task.get("digest", 0) == task.get("digest", ""):
break
idx += 1
@ -356,12 +334,11 @@ def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config:
return 0
task["chunk_ids"] = prev_task["chunk_ids"]
task["progress"] = 1.0
if "from_page" in task and "to_page" in task and int(task['to_page']) - int(task['from_page']) >= 10 ** 6:
if "from_page" in task and "to_page" in task and int(task["to_page"]) - int(task["from_page"]) >= 10**6:
task["progress_msg"] = f"Page({task['from_page']}~{task['to_page']}): "
else:
task["progress_msg"] = ""
task["progress_msg"] = " ".join(
[datetime.now().strftime("%H:%M:%S"), task["progress_msg"], "Reused previous task's chunks."])
task["progress_msg"] = " ".join([datetime.now().strftime("%H:%M:%S"), task["progress_msg"], "Reused previous task's chunks."])
prev_task["chunk_ids"] = ""
return len(task["chunk_ids"].split())
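Chunk reuse above hinges on matching `from_page` and the task `digest`. How the digest is computed is not visible in this diff; purely as an illustration, a stable digest could be derived with `xxhash` (which this module imports) over a task's defining fields, as sketched below. The field selection is an assumption, not the project's actual scheme.

```python
# Hypothetical digest over a task's defining fields; illustration only.
import json

import xxhash


def task_digest(doc_id: str, from_page: int, to_page: int, parser_config: dict) -> str:
    payload = json.dumps(
        {"doc_id": doc_id, "from_page": from_page, "to_page": to_page, "parser_config": parser_config},
        sort_keys=True,
        ensure_ascii=False,
    )
    return xxhash.xxh64(payload.encode("utf-8")).hexdigest()


print(task_digest("doc-123", 0, 12, {"layout_recognize": "DeepDOC"}))
```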

View File

@ -1,122 +0,0 @@
English | [简体中文](./README_zh.md)
# *Deep*Doc
- [1. Introduction](#1)
- [2. Vision](#2)
- [3. Parser](#3)
<a name="1"></a>
## 1. Introduction
With documents coming from various domains, in various formats, and with diverse retrieval requirements,
accurate analysis becomes a very challenging task. *Deep*Doc was born for that purpose.
There are two parts in *Deep*Doc so far: vision and parser.
You can run the following test programs if you're interested in our results for OCR, layout recognition and TSR.
```bash
python deepdoc/vision/t_ocr.py -h
usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR]
options:
-h, --help show this help message and exit
--inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
--output_dir OUTPUT_DIR
Directory where to store the output images. Default: './ocr_outputs'
```
```bash
python deepdoc/vision/t_recognizer.py -h
usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}]
options:
-h, --help show this help message and exit
--inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
--output_dir OUTPUT_DIR
Directory where to store the output images. Default: './layouts_outputs'
--threshold THRESHOLD
A threshold to filter out detections. Default: 0.5
--mode {layout,tsr} Task mode: layout recognition or table structure recognition
```
Our models are served on HuggingFace. If you have trouble downloading HuggingFace models, this might help!!
```bash
export HF_ENDPOINT=https://hf-mirror.com
```
<a name="2"></a>
## 2. Vision
We use vision information to resolve problems as human beings do.
- OCR. Since a lot of documents are presented as images, or can at least be converted to images,
OCR is an essential, fundamental, even universal solution for text extraction.
```bash
python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result
```
The input can be a directory of images or PDFs, or a single image or PDF.
You can look into the folder 'path_to_store_result', which contains images that mark the positions of the results,
along with txt files that contain the OCR text.
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/>
</div>
- Layout recognition. Documents from different domains may have various layouts;
newspapers, magazines, books and résumés, for example, are all distinct in terms of layout.
Only with an accurate layout analysis can a machine decide whether text parts are successive,
whether a region needs Table Structure Recognition (TSR), or whether a region is a figure described by its caption.
We have 10 basic layout components, which cover most cases:
- Text
- Title
- Figure
- Figure caption
- Table
- Table caption
- Header
- Footer
- Reference
- Equation
Try the following command to see the layout detection results.
```bash
python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result
```
The input can be a directory of images or PDFs, or a single image or PDF.
You can look into the folder 'path_to_store_result', which contains images that show the detection results, as follows:
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/>
</div>
- Table Structure Recognition (TSR). Data tables are a frequently used structure to present data, including numbers or text.
The structure of a table might be very complex, with hierarchical headers, spanning cells and projected row headers.
Along with TSR, we also reassemble the content into sentences that can be well comprehended by an LLM.
We have five labels for the TSR task:
- Column
- Row
- Column header
- Projected row header
- Spanning cell
Try the following command to see the detection results.
```bash
python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result
```
The input can be a directory of images or PDFs, or a single image or PDF.
You can look into the folder 'path_to_store_result', which contains both images and HTML pages that show the detection results, as follows:
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
</div>
<a name="3"></a>
## 3. Parser
Four document formats, PDF, DOCX, EXCEL and PPT, have their corresponding parsers.
The most complex one is the PDF parser, owing to PDF's flexibility. The output of the PDF parser includes:
- Text chunks with their own positions in the PDF (page number and rectangular coordinates).
- Tables with the image cropped from the PDF, plus contents that have already been translated into natural language sentences.
- Figures with their captions and the text inside the figures.
### Résumé
The résumé is a very complicated kind of document. A résumé, which is composed of unstructured text
with various layouts, can be resolved into structured data composed of nearly a hundred fields.
We haven't open-sourced the parser itself yet; only the processing that follows the parsing procedure is released.

View File

@ -1,116 +0,0 @@
[English](./README.md) | 简体中文
# *Deep*Doc
- [*Deep*Doc](#deepdoc)
- [1. Introduction](#1-介绍)
- [2. Vision](#2-视觉处理)
- [3. Parser](#3-解析器)
- [Résumé](#简历)
<a name="1"></a>
## 1. Introduction
For the large numbers of documents that come from different domains, in different formats and with different retrieval requirements, accurate analysis is an extremely challenging task. *Deep*Doc was born for exactly this purpose. So far, *Deep*Doc has two components: vision and parser. If you are interested in our OCR, layout recognition and TSR results, you can run the test programs below.
```bash
python deepdoc/vision/t_ocr.py -h
usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR]
options:
-h, --help show this help message and exit
--inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
--output_dir OUTPUT_DIR
Directory where to store the output images. Default: './ocr_outputs'
```
```bash
python deepdoc/vision/t_recognizer.py -h
usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}]
options:
-h, --help show this help message and exit
--inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
--output_dir OUTPUT_DIR
Directory where to store the output images. Default: './layouts_outputs'
--threshold THRESHOLD
A threshold to filter out detections. Default: 0.5
--mode {layout,tsr} Task mode: layout recognition or table structure recognition
```
Our models are served on HuggingFace. If you run into problems downloading HuggingFace models, this might help!
```bash
export HF_ENDPOINT=https://hf-mirror.com
```
<a name="2"></a>
## 2. Vision
As humans, we use visual information to solve problems.
- **OCR (Optical Character Recognition)**. Since many documents are presented as images, or can at least be converted to images, OCR is a very important, fundamental, even universal solution for text extraction.
```bash
python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result
```
The input can be a directory of images or PDFs, or a single image or PDF file. You can look into the folder `path_to_store_result`, which contains images marking the positions of the results, along with txt files containing the OCR text.
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/>
</div>
- Layout recognition. Documents from different domains may have different layouts; newspapers, magazines, books and résumés, for instance, all differ in layout. Only with an accurate layout analysis can a machine decide whether text parts are continuous or not, whether a region needs Table Structure Recognition (TSR), or whether a region is a figure described by its caption. We have 10 basic layout components covering most cases:
- Text
- Title
- Figure
- Figure caption
- Table
- Table caption
- Header
- Footer
- Reference
- Equation
Try the following command to see the layout detection results.
```bash
python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result
```
The input can be a directory of images or PDFs, or a single image or PDF file. You can look into the folder `path_to_store_result`, which contains images showing the detection results, as follows:
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/>
</div>
- **TSR (Table Structure Recognition)**. Data tables are a commonly used structure for presenting data, whether numbers or text. The structure of a table can be very complex, with hierarchical headers, spanning cells and projected row headers. Along with TSR, we also reassemble the content into sentences that an LLM can understand well. The TSR task has five labels:
- Column
- Row
- Column header
- Row header
- Spanning (merged) cell
Try the following command to see the detection results.
```bash
python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result
```
The input can be a directory of images or PDFs, or a single image or PDF file. You can look into the folder `path_to_store_result`, which contains both images and HTML pages showing the detection results, as follows:
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
</div>
<a name="3"></a>
## 3. Parser
All four document formats, PDF, DOCX, EXCEL and PPT, have corresponding parsers. The most complex one is the PDF parser, because of PDF's flexibility. The output of the PDF parser includes:
- Text chunks with their own positions in the PDF (page number and rectangular coordinates).
- Tables with the image cropped from the PDF, plus contents that have already been translated into natural language sentences.
- Figures with their captions and the text inside the figures.
### Résumé
A résumé is a very complicated kind of document. A résumé composed of unstructured text in various layouts can be parsed into structured data with nearly a hundred fields. We haven't open-sourced the parser yet; only the processing that follows the parsing procedure is released.

View File

@ -1,18 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from beartype.claw import beartype_this_package
beartype_this_package()

View File

@ -1,36 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
from .docx_parser import RAGFlowDocxParser as DocxParser
from .excel_parser import RAGFlowExcelParser as ExcelParser
from .ppt_parser import RAGFlowPptParser as PptParser
from .html_parser import RAGFlowHtmlParser as HtmlParser
from .json_parser import RAGFlowJsonParser as JsonParser
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
from .txt_parser import RAGFlowTxtParser as TxtParser
__all__ = [
"PdfParser",
"PlainParser",
"DocxParser",
"ExcelParser",
"PptParser",
"HtmlParser",
"JsonParser",
"MarkdownParser",
"TxtParser",
]

View File

@ -1,227 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from docx import Document
import re
import pandas as pd
from collections import Counter
from rag.nlp import rag_tokenizer
from io import BytesIO
class RAGFlowDocxParser:
"""
Parser for Word documents (.docx): extracts the text content and tables from a document.
The parser can:
1. Extract paragraph text and its style within a given page range
2. Recognize tables in the document and convert them into structured text
3. Handle table headers and contents intelligently, producing semantic text descriptions
"""
def __extract_table_content(self, tb):
"""
Extract the content of a Word table object and convert it to a DataFrame.
Args:
tb: a Table object from the docx library
Returns:
A list of text lines representing the processed table content
"""
df = []
for row in tb.rows:
df.append([c.text for c in row.cells])
return self.__compose_table_content(pd.DataFrame(df))
def __compose_table_content(self, df):
"""
Convert a table DataFrame into a semantic text description.
By recognizing structural features of the table (headers, data types, etc.), the table is turned into a more readable text form.
Args:
df: DataFrame holding the table content
Returns:
A list of text representations of the table content
"""
def blockType(b):
"""
Identify the content type of a cell.
Using regular expressions and text-feature analysis, cell content is classified into:
- Dt: date
- Nu: number
- Ca: code/ID
- En: English text
- NE: mixed numbers and text
- Sg: single character
- Tx: short text
- Lx: long text
- Nr: person name
- Ot: other
Args:
b: the cell's text content
Returns:
A string identifier for the content type
"""
patt = [
("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
(r"^(20|19)[0-9]{2}年$", "Dt"),
(r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
(r"^第*[一二三四1-4]季度$", "Dt"),
(r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
(r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
("^[0-9.,+%/ -]+$", "Nu"),
(r"^[0-9A-Z/\._~-]+$", "Ca"),
(r"^[A-Z]*[a-z' -]+$", "En"),
(r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()' -]+$", "NE"),
(r"^.{1}$", "Sg")
]
for p, n in patt:
if re.search(p, b):
return n
tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
if len(tks) > 3:
if len(tks) < 12:
return "Tx"
else:
return "Lx"
if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
return "Nr"
return "Ot"
# A table needs at least two rows to be processed
if len(df) < 2:
return []
# Count the most common content type across the table cells
max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
1, len(df)) for j in range(len(df.iloc[i, :]))])
max_type = max(max_type.items(), key=lambda x: x[1])[0]
# Number of columns in the table
colnm = len(df.iloc[0, :])
# The first row is treated as a header row by default
hdrows = [0] # the header does not necessarily appear in the first row
# If the table is mostly numeric, treat the non-numeric rows as header rows
if max_type == "Nu":
for r in range(1, len(df)):
tys = Counter([blockType(str(df.iloc[r, j]))
for j in range(len(df.iloc[r, :]))])
tys = max(tys.items(), key=lambda x: x[1])[0]
if tys != max_type:
hdrows.append(r)
# Process the table content, converting each row into text
lines = []
for i in range(1, len(df)):
# Skip header rows
if i in hdrows:
continue
# Compute the header rows that precede the current row
hr = [r - i for r in hdrows]
hr = [r for r in hr if r < 0]
# Find the nearest run of consecutive header rows
t = len(hr) - 1
while t > 0:
if hr[t] - hr[t - 1] > 1:
hr = hr[t:]
break
t -= 1
# Build a header description for each column
headers = []
for j in range(len(df.iloc[i, :])):
t = []
for h in hr:
x = str(df.iloc[i + h, j]).strip()
if x in t:
continue
t.append(x)
t = ",".join(t)
if t:
t += ": "
headers.append(t)
# Build the text representation of each row
cells = []
for j in range(len(df.iloc[i, :])):
if not str(df.iloc[i, j]):
continue
cells.append(headers[j] + str(df.iloc[i, j]))
lines.append(";".join(cells))
# Decide the return format based on the number of columns
if colnm > 3:
return lines
return ["\n".join(lines)]
def __call__(self, fnm, from_page=0, to_page=100000000):
"""
Parse a Word document and extract the text and tables within the given page range.
Args:
fnm: file name or binary content
from_page: starting page number (0-based)
to_page: ending page number
Returns:
A tuple (secs, tbls) where:
- secs: list of paragraph contents, each item a (text, style name) tuple
- tbls: list of table contents
"""
# Create the Document object according to the input type
self.doc = Document(fnm) if isinstance(
fnm, str) else Document(BytesIO(fnm))
pn = 0 # current page number while parsing
secs = [] # parsed paragraph contents
# Iterate over all paragraphs in the document
for p in self.doc.paragraphs:
# Stop parsing once past the given page range
if pn > to_page:
break
runs_within_single_paragraph = [] # text runs that fall within the page range
# Iterate over all text runs in the paragraph
for run in p.runs:
if pn > to_page:
break
# If the current page is within range and the paragraph has content, collect the text
if from_page <= pn < to_page and p.text.strip():
runs_within_single_paragraph.append(run.text) # collect the text run first
# Check for a page break marker
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
# Append the paragraph text and its style to the result list
secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then join the runs into the paragraph text
# Extract the content of all tables
tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
return secs, tbls
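A hedged usage sketch of the parser above; it assumes the `RAGFlowDocxParser` class is in scope (it used to live in `deepdoc/parser/docx_parser.py`) and that a local `sample.docx` exists.

```python
# Hypothetical usage; sample.docx is a placeholder path.
parser = RAGFlowDocxParser()
sections, tables = parser("sample.docx")  # a bytes object works as well

for text, style in sections[:5]:
    print(f"[{style}] {text}")

for table_lines in tables:
    print("TABLE:", table_lines)
```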

View File

@ -1,150 +0,0 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import sys
from io import BytesIO
import pandas as pd
from openpyxl import Workbook, load_workbook
from rag.nlp import find_codec
class RAGFlowExcelParser:
@staticmethod
def _load_excel_to_workbook(file_like_object):
if isinstance(file_like_object, bytes):
file_like_object = BytesIO(file_like_object)
# Read first 4 bytes to determine file type
file_like_object.seek(0)
file_head = file_like_object.read(4)
file_like_object.seek(0)
if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')):
logging.info("****wxy: Not an Excel file, converting CSV to Excel Workbook")
try:
file_like_object.seek(0)
df = pd.read_csv(file_like_object)
return RAGFlowExcelParser._dataframe_to_workbook(df)
except Exception as e_csv:
raise Exception(f"****wxy: Failed to parse CSV and convert to Excel Workbook: {e_csv}")
try:
return load_workbook(file_like_object)
except Exception as e:
logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead")
try:
file_like_object.seek(0)
df = pd.read_excel(file_like_object)
return RAGFlowExcelParser._dataframe_to_workbook(df)
except Exception as e_pandas:
raise Exception(f"****wxy: pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
@staticmethod
def _dataframe_to_workbook(df):
wb = Workbook()
ws = wb.active
ws.title = "Data"
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
return wb
def html(self, fnm, chunk_rows=256):
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
tb_chunks = []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:
continue
tb_rows_0 = "<tr>"
for t in list(rows[0]):
tb_rows_0 += f"<th>{t.value}</th>"
tb_rows_0 += "</tr>"
for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
tb = ""
tb += f"<table><caption>{sheetname}</caption>"
tb += tb_rows_0
for r in list(
rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows]
):
tb += "<tr>"
for i, c in enumerate(r):
if c.value is None:
tb += "<td></td>"
else:
tb += f"<td>{c.value}</td>"
tb += "</tr>"
tb += "</table>\n"
tb_chunks.append(tb)
return tb_chunks
def __call__(self, fnm):
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
res = []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:
continue
ti = list(rows[0])
for r in list(rows[1:]):
fields = []
for i, c in enumerate(r):
if not c.value:
continue
t = str(ti[i].value) if i < len(ti) else ""
t += ("" if t else "") + str(c.value)
fields.append(t)
line = "; ".join(fields)
if sheetname.lower().find("sheet") < 0:
line += " ——" + sheetname
res.append(line)
return res
@staticmethod
def row_number(fnm, binary):
if fnm.split(".")[-1].lower().find("xls") >= 0:
wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
total = 0
for sheetname in wb.sheetnames:
ws = wb[sheetname]
total += len(list(ws.rows))
return total
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
return len(txt.split("\n"))
if __name__ == "__main__":
psr = RAGFlowExcelParser()
psr(sys.argv[1])
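Beyond the `__call__` usage in the `__main__` block above, the parser can also emit HTML table chunks and count rows. A hedged sketch, assuming the class above is in scope and a local `sample.xlsx` exists:

```python
# Hypothetical usage; sample.xlsx is a placeholder path.
with open("sample.xlsx", "rb") as f:
    binary = f.read()

parser = RAGFlowExcelParser()

# One "<table>...</table>" string per chunk of up to 256 data rows per sheet.
for chunk in parser.html(binary, chunk_rows=256):
    print(chunk[:120], "...")

# Row count, formerly used by queue_tasks() to split spreadsheet parsing tasks.
print(RAGFlowExcelParser.row_number("sample.xlsx", binary))
```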

View File

@ -1,50 +0,0 @@
# -*- coding: utf-8 -*-
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from rag.nlp import find_codec
import readability
import html_text
import chardet
def get_encoding(file):
with open(file,'rb') as f:
tmp = chardet.detect(f.read())
return tmp['encoding']
class RAGFlowHtmlParser:
def __call__(self, fnm, binary=None):
txt = ""
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r",encoding=get_encoding(fnm)) as f:
txt = f.read()
return self.parser_txt(txt)
@classmethod
def parser_txt(cls, txt):
if not isinstance(txt, str):
raise TypeError("txt type should be str!")
html_doc = readability.Document(txt)
title = html_doc.title()
content = html_text.extract_text(html_doc.summary(html_partial=True))
txt = f"{title}\n{content}"
sections = txt.split("\n")
return sections
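A hedged usage sketch of `parser_txt` on a raw HTML string; it assumes the class above is in scope and that the `readability` and `html_text` dependencies it imports are installed.

```python
# Hypothetical usage of the classmethod parser_txt.
html = """
<html><head><title>Quarterly report</title></head>
<body><h1>Summary</h1><p>Revenue grew in Q2.</p></body></html>
"""

for line in RAGFlowHtmlParser.parser_txt(html):  # title followed by extracted text, split on newlines
    print(line)
```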

View File

@ -1,133 +0,0 @@
# -*- coding: utf-8 -*-
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The following documents are mainly referenced, and only adaptation modifications have been made
# from https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/json.py
import json
from typing import Any
from rag.nlp import find_codec
class RAGFlowJsonParser:
def __init__(
self, max_chunk_size: int = 2000, min_chunk_size: int | None = None
):
super().__init__()
self.max_chunk_size = max_chunk_size * 2
self.min_chunk_size = (
min_chunk_size
if min_chunk_size is not None
else max(max_chunk_size - 200, 50)
)
def __call__(self, binary):
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
json_data = json.loads(txt)
chunks = self.split_json(json_data, True)
sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
return sections
@staticmethod
def _json_size(data: dict) -> int:
"""Calculate the size of the serialized JSON object."""
return len(json.dumps(data, ensure_ascii=False))
@staticmethod
def _set_nested_dict(d: dict, path: list[str], value: Any) -> None:
"""Set a value in a nested dictionary based on the given path."""
for key in path[:-1]:
d = d.setdefault(key, {})
d[path[-1]] = value
def _list_to_dict_preprocessing(self, data: Any) -> Any:
if isinstance(data, dict):
# Process each key-value pair in the dictionary
return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
elif isinstance(data, list):
# Convert the list to a dictionary with index-based keys
return {
str(i): self._list_to_dict_preprocessing(item)
for i, item in enumerate(data)
}
else:
# Base case: the item is neither a dict nor a list, so return it unchanged
return data
def _json_split(
self,
data,
current_path: list[str] | None,
chunks: list[dict] | None,
) -> list[dict]:
"""
        Split JSON into dictionaries no larger than max_chunk_size while preserving structure.
"""
current_path = current_path or []
chunks = chunks or [{}]
if isinstance(data, dict):
for key, value in data.items():
new_path = current_path + [key]
chunk_size = self._json_size(chunks[-1])
size = self._json_size({key: value})
remaining = self.max_chunk_size - chunk_size
if size < remaining:
# Add item to current chunk
self._set_nested_dict(chunks[-1], new_path, value)
else:
if chunk_size >= self.min_chunk_size:
# Chunk is big enough, start a new chunk
chunks.append({})
# Iterate
self._json_split(value, new_path, chunks)
else:
# handle single item
self._set_nested_dict(chunks[-1], current_path, data)
return chunks
def split_json(
self,
json_data,
convert_lists: bool = False,
) -> list[dict]:
"""Splits JSON into a list of JSON chunks"""
if convert_lists:
preprocessed_data = self._list_to_dict_preprocessing(json_data)
chunks = self._json_split(preprocessed_data, None, None)
else:
chunks = self._json_split(json_data, None, None)
# Remove the last chunk if it's empty
if not chunks[-1]:
chunks.pop()
return chunks
def split_text(
self,
json_data: dict[str, Any],
convert_lists: bool = False,
ensure_ascii: bool = True,
) -> list[str]:
"""Splits JSON into a list of JSON formatted strings"""
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
# Convert to string
return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]
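
A minimal sketch of how the splitter above can be driven, using an invented payload; the chunk size below is illustrative only.

if __name__ == "__main__":
    parser = RAGFlowJsonParser(max_chunk_size=200)
    payload = {
        "title": "demo",
        "items": [{"id": i, "text": "x" * 50} for i in range(20)],
    }
    # __call__ expects raw bytes: it decodes, converts lists to index-keyed
    # dicts, splits by serialized size, and returns one JSON string per chunk.
    for section in parser(json.dumps(payload).encode("utf-8")):
        print(len(section), section[:60])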

View File

@ -1,77 +0,0 @@
# -*- coding: utf-8 -*-
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
class RAGFlowMarkdownParser:
def __init__(self, chunk_token_num=128):
self.chunk_token_num = int(chunk_token_num)
def extract_tables_and_remainder(self, markdown_text):
tables = []
remainder = markdown_text
if "|" in markdown_text: # for optimize performance
# Standard Markdown table
border_table_pattern = re.compile(
r'''
(?:\n|^)
(?:\|.*?\|.*?\|.*?\n)
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+
''', re.VERBOSE)
border_tables = border_table_pattern.findall(markdown_text)
tables.extend(border_tables)
remainder = border_table_pattern.sub('', remainder)
# Borderless Markdown table
no_border_table_pattern = re.compile(
r'''
(?:\n|^)
(?:\S.*?\|.*?\n)
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+
''', re.VERBOSE)
no_border_tables = no_border_table_pattern.findall(remainder)
tables.extend(no_border_tables)
remainder = no_border_table_pattern.sub('', remainder)
if "<table>" in remainder.lower(): # for optimize performance
#HTML table extraction - handle possible html/body wrapper tags
html_table_pattern = re.compile(
r'''
(?:\n|^)
\s*
(?:
# case1: <html><body><table>...</table></body></html>
(?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
|
# case2: <body><table>...</table></body>
(?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
|
                    # case3: only <table>...</table>
(?:<table[^>]*>.*?</table>)
)
\s*
(?=\n|$)
''',
re.VERBOSE | re.DOTALL | re.IGNORECASE
)
html_tables = html_table_pattern.findall(remainder)
tables.extend(html_tables)
remainder = html_table_pattern.sub('', remainder)
return remainder, tables
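
A minimal sketch of the table extractor above on an invented markdown snippet; it returns the prose remainder and the captured table blocks separately.

if __name__ == "__main__":
    md = (
        "Intro paragraph.\n"
        "\n"
        "| name | score |\n"
        "| --- | --- |\n"
        "| a | 1 |\n"
        "| b | 2 |\n"
        "\n"
        "Closing paragraph.\n"
    )
    remainder, tables = RAGFlowMarkdownParser().extract_tables_and_remainder(md)
    print(len(tables))  # 1: the bordered table above
    print(remainder)    # the two paragraphs with the table removed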

File diff suppressed because it is too large

View File

@ -1,68 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from io import BytesIO
from pptx import Presentation
class RAGFlowPptParser:
def __init__(self):
super().__init__()
def __extract(self, shape):
        if shape.shape_type == 19:  # 19 == MSO_SHAPE_TYPE.TABLE
tb = shape.table
rows = []
for i in range(1, len(tb.rows)):
rows.append("; ".join([tb.cell(
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
return "\n".join(rows)
if shape.has_text_frame:
return shape.text_frame.text
        if shape.shape_type == 6:  # 6 == MSO_SHAPE_TYPE.GROUP
texts = []
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
t = self.__extract(p)
if t:
texts.append(t)
return "\n".join(texts)
def __call__(self, fnm, from_page, to_page, callback=None):
ppt = Presentation(fnm) if isinstance(
fnm, str) else Presentation(
BytesIO(fnm))
txts = []
self.total_page = len(ppt.slides)
for i, slide in enumerate(ppt.slides):
if i < from_page:
continue
if i >= to_page:
break
texts = []
for shape in sorted(
slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
try:
txt = self.__extract(shape)
if txt:
texts.append(txt)
except Exception as e:
logging.exception(e)
txts.append("\n".join(texts))
return txts
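
A minimal sketch of the slide parser above; deck.pptx is a hypothetical path, and the page bounds are 0-based slide indices.

if __name__ == "__main__":
    parser = RAGFlowPptParser()
    # Extract text (tables, text frames, grouped shapes) from slides 0..2.
    for i, slide_text in enumerate(parser("deck.pptx", from_page=0, to_page=3)):
        print(f"--- slide {i} ---")
        print(slide_text)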

View File

@ -1,109 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
def refactor(cv):
for n in [
"raw_txt",
"parser_name",
"inference",
"ori_text",
"use_time",
"time_stat",
]:
if n in cv and cv[n] is not None:
del cv[n]
cv["is_deleted"] = 0
if "basic" not in cv:
cv["basic"] = {}
if cv["basic"].get("photo2"):
del cv["basic"]["photo2"]
for n in [
"education",
"work",
"certificate",
"project",
"language",
"skill",
"training",
]:
if n not in cv or cv[n] is None:
continue
if isinstance(cv[n], dict):
cv[n] = [v for _, v in cv[n].items()]
if not isinstance(cv[n], list):
del cv[n]
continue
vv = []
for v in cv[n]:
if "external" in v and v["external"] is not None:
del v["external"]
vv.append(v)
cv[n] = {str(i): vv[i] for i in range(len(vv))}
basics = [
("basic_salary_month", "salary_month"),
("expect_annual_salary_from", "expect_annual_salary"),
]
for n, t in basics:
if cv["basic"].get(n):
cv["basic"][t] = cv["basic"][n]
del cv["basic"][n]
work = sorted(
[v for _, v in cv.get("work", {}).items()],
key=lambda x: x.get("start_time", ""),
)
edu = sorted(
[v for _, v in cv.get("education", {}).items()],
key=lambda x: x.get("start_time", ""),
)
if work:
cv["basic"]["work_start_time"] = work[0].get("start_time", "")
cv["basic"]["management_experience"] = (
"Y"
if any([w.get("management_experience", "") == "Y" for w in work])
else "N"
)
cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0")
for n in [
"annual_salary_from",
"annual_salary_to",
"industry_name",
"position_name",
"responsibilities",
"corporation_type",
"scale",
"corporation_name",
]:
cv["basic"][n] = work[-1].get(n, "")
if edu:
for n in ["school_name", "discipline_name"]:
if n in edu[-1]:
cv["basic"][n] = edu[-1][n]
cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
if "contact" not in cv:
cv["contact"] = {}
if not cv["contact"].get("name"):
cv["contact"]["name"] = cv["basic"].get("name", "")
return cv
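
A minimal sketch of the resume normalizer above, using an invented CV payload whose keys follow the fields the function reads.

if __name__ == "__main__":
    cv = {
        "raw_txt": "...",  # stripped by refactor()
        "basic": {"name": "Zhang San", "basic_salary_month": "20k"},
        "work": [
            {"start_time": "2018-01", "management_experience": "N",
             "annual_salary_from": "300000", "corporation_name": "ACME"},
        ],
        "education": [
            {"start_time": "2014-09", "school_name": "Example University",
             "discipline_name": "Computer Science"},
        ],
    }
    cv = refactor(cv)
    print(cv["basic"]["work_start_time"])   # "2018-01"
    print(cv["basic"]["school_name"])       # "Example University"
    print(cv["contact"]["name"])            # falls back to basic["name"]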

View File

@ -1,15 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View File

@ -1,128 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
import json
import os
import pandas as pd
from rag.nlp import rag_tokenizer
from . import regions
current_file_path = os.path.dirname(os.path.abspath(__file__))
GOODS = pd.read_csv(
os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0
).fillna(0)
GOODS["cid"] = GOODS["cid"].astype(str)
GOODS = GOODS.set_index(["cid"])
CORP_TKS = json.load(
open(os.path.join(current_file_path, "res/corp.tks.freq.json"), "r",encoding="utf-8")
)
GOOD_CORP = json.load(open(os.path.join(current_file_path, "res/good_corp.json"), "r",encoding="utf-8"))
CORP_TAG = json.load(open(os.path.join(current_file_path, "res/corp_tag.json"), "r",encoding="utf-8"))
def baike(cid, default_v=0):
global GOODS
try:
return GOODS.loc[str(cid), "len"]
except Exception:
pass
return default_v
def corpNorm(nm, add_region=True):
global CORP_TKS
if not nm or not isinstance(nm, str):
return ""
nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
nm = re.sub(r"&amp;", "&", nm)
nm = re.sub(r"[\(\)\+'\"\t \*\\【】-]+", " ", nm)
nm = re.sub(
r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE
)
nm = re.sub(
r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
"",
nm,
10000,
re.IGNORECASE,
)
if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
return nm
tks = rag_tokenizer.tokenize(nm).split()
reg = [t for i, t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
nm = ""
for t in tks:
if regions.isName(t) or t in CORP_TKS:
continue
if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):
nm += " "
nm += t
r = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", nm.strip())
if r:
nm = r.group(1)
r = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", nm.strip())
if r:
nm = r.group(1)
return nm.strip() + (("" if not reg else "(%s)" % reg[0]) if add_region else "")
def rmNoise(n):
n = re.sub(r"[\(][^()]+[)]", "", n)
n = re.sub(r"[,. &()]+", "", n)
return n
GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
for c, v in CORP_TAG.items():
cc = corpNorm(rmNoise(c), False)
if not cc:
logging.debug(c)
CORP_TAG = {corpNorm(rmNoise(c), False): v for c, v in CORP_TAG.items()}
def is_good(nm):
global GOOD_CORP
if nm.find("外派") >= 0:
return False
nm = rmNoise(nm)
nm = corpNorm(nm, False)
for n in GOOD_CORP:
if re.match(r"[0-9a-zA-Z]+$", n):
if n == nm:
return True
elif nm.find(n) >= 0:
return True
return False
def corp_tag(nm):
global CORP_TAG
nm = rmNoise(nm)
nm = corpNorm(nm, False)
for n in CORP_TAG.keys():
if re.match(r"[0-9a-zA-Z., ]+$", n):
if n == nm:
return CORP_TAG[n]
elif nm.find(n) >= 0:
if len(n) < 3 and len(nm) / len(n) >= 2:
continue
return CORP_TAG[n]
return []
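
A minimal sketch of the normalization helpers above; the company names are invented, and results depend on the bundled res/ dictionaries and rag_tokenizer.

if __name__ == "__main__":
    for raw in ["北京某某网络科技有限公司", "ACME Co., Ltd."]:
        # rmNoise strips parenthesized text and punctuation; corpNorm tokenizes,
        # drops region/suffix tokens, and optionally re-appends the region.
        print(raw, "->", corpNorm(rmNoise(raw)))
    print(is_good("某某科技有限公司"))   # True only if it maps into GOOD_CORP
    print(corp_tag("某某科技有限公司"))  # tag list from CORP_TAG, possibly []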

View File

@ -1,44 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
TBL = {
"94": "EMBA",
"6": "MBA",
"95": "MPA",
"92": "专升本",
"4": "专科",
"90": "中专",
"91": "中技",
"86": "初中",
"3": "博士",
"10": "博士后",
"1": "本科",
"2": "硕士",
"87": "职高",
"89": "高中",
}
TBL_ = {v: k for k, v in TBL.items()}
def get_name(id):
return TBL.get(str(id), "")
def get_id(nm):
if not nm:
return ""
return TBL_.get(nm.upper().strip(), "")
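
A short sketch of the degree-code table above; ids map to degree names, and get_id reverses the mapping case-insensitively.

if __name__ == "__main__":
    print(get_name(1))        # "本科" (bachelor's degree)
    print(get_name("3"))      # "博士" (doctorate)
    print(get_id("mba"))      # "6", since lookups are upper-cased
    print(get_id("unknown"))  # "" for names outside the table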

View File

@ -1,712 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
TBL = {
"1": {"name": "IT/通信/电子", "parent": "0"},
"2": {"name": "互联网", "parent": "0"},
"3": {"name": "电子商务", "parent": "2"},
"4": {"name": "互联网金融", "parent": "2"},
"5": {"name": "网络游戏", "parent": "2"},
"6": {"name": "社交网络平台", "parent": "2"},
"7": {"name": "视频音乐", "parent": "2"},
"9": {"name": "安全", "parent": "2"},
"10": {"name": "云计算", "parent": "2"},
"12": {"name": "工具类客户端应用", "parent": "2"},
"13": {"name": "互联网广告", "parent": "2"},
"14": {"name": "企业互联网服务", "parent": "2"},
"16": {"name": "在线教育", "parent": "2"},
"17": {"name": "在线医疗", "parent": "2"},
"19": {"name": "B2B", "parent": "3"},
"20": {"name": "B2C", "parent": "3"},
"21": {"name": "C2C", "parent": "3"},
"22": {"name": "生活信息本地化", "parent": "3"},
"23": {"name": "在线旅游", "parent": "2"},
"24": {"name": "第三方支付", "parent": "4"},
"26": {"name": "客户端游戏", "parent": "5"},
"27": {"name": "网页游戏", "parent": "5"},
"28": {"name": "手机游戏", "parent": "5"},
"29": {"name": "微博", "parent": "6"},
"30": {"name": "社交网站", "parent": "6"},
"31": {"name": "在线视频", "parent": "7"},
"32": {"name": "在线音乐", "parent": "7"},
"35": {"name": "企业安全", "parent": "9"},
"36": {"name": "个人安全", "parent": "9"},
"37": {"name": "企业级云服务", "parent": "10"},
"38": {"name": "个人级云服务", "parent": "10"},
"43": {"name": "输入法", "parent": "12"},
"44": {"name": "浏览器", "parent": "12"},
"45": {"name": "词典", "parent": "12"},
"46": {"name": "播放器", "parent": "12"},
"47": {"name": "下载器", "parent": "12"},
"48": {"name": "IM", "parent": "12"},
"49": {"name": "广告服务", "parent": "13"},
"50": {"name": "第三方广告网络平台", "parent": "13"},
"51": {"name": "媒体代理", "parent": "13"},
"52": {"name": "创意代理", "parent": "13"},
"53": {"name": "IT-综合", "parent": "1"},
"71": {"name": "团购", "parent": "3"},
"72": {"name": "地图", "parent": "2"},
"73": {"name": "数据存储", "parent": "2"},
"414": {"name": "计算机软件", "parent": "1"},
"415": {"name": "计算机硬件", "parent": "1"},
"416": {"name": "计算机服务(系统、数据服务、维修)", "parent": "1"},
"417": {"name": "通信/电信/网络设备", "parent": "1"},
"418": {"name": "通信/电信运营、增值服务", "parent": "1"},
"419": {"name": "电子技术/半导体/集成电路", "parent": "1"},
"472": {"name": "P2P网贷", "parent": "4"},
"473": {"name": "互联网理财", "parent": "4"},
"474": {"name": "婚恋", "parent": "6"},
"476": {"name": "虚拟化", "parent": "10"},
"477": {"name": "邮箱", "parent": "12"},
"478": {"name": "商业智能", "parent": "14"},
"479": {"name": "企业建站", "parent": "14"},
"480": {"name": "安防", "parent": "14"},
"481": {"name": "网络营销", "parent": "2"},
"487": {"name": "智能终端", "parent": "2"},
"488": {"name": "移动互联网", "parent": "2"},
"489": {"name": "数字城市", "parent": "2"},
"490": {"name": "大数据", "parent": "2"},
"491": {"name": "互联网人力资源", "parent": "2"},
"492": {"name": "舆情监控", "parent": "2"},
"493": {"name": "移动营销", "parent": "481"},
"494": {"name": "微博营销", "parent": "481"},
"495": {"name": "精准营销", "parent": "481"},
"496": {"name": "海外营销", "parent": "481"},
"497": {"name": "微信营销", "parent": "481"},
"498": {"name": "智能手机", "parent": "487"},
"499": {"name": "可穿戴设备", "parent": "487"},
"500": {"name": "智能电视", "parent": "487"},
"501": {"name": "WAP", "parent": "488"},
"502": {"name": "物联网", "parent": "489"},
"503": {"name": "O2O", "parent": "489"},
"504": {"name": "数字出版", "parent": "489"},
"505": {"name": "搜索", "parent": "2"},
"506": {"name": "垂直搜索", "parent": "505"},
"507": {"name": "无线搜索", "parent": "505"},
"508": {"name": "网页搜索", "parent": "505"},
"509": {"name": "网址导航", "parent": "2"},
"510": {"name": "门户", "parent": "2"},
"511": {"name": "网络文学", "parent": "2"},
"512": {"name": "自媒体", "parent": "2"},
"513": {"name": "金融", "parent": "0"},
"514": {"name": "建筑与房地产", "parent": "0"},
"515": {"name": "专业服务", "parent": "0"},
"516": {"name": "教育培训", "parent": "0"},
"517": {"name": "文化传媒", "parent": "0"},
"518": {"name": "消费品", "parent": "0"},
"519": {"name": "工业", "parent": "0"},
"520": {"name": "交通物流", "parent": "0"},
"521": {"name": "贸易", "parent": "0"},
"522": {"name": "医药", "parent": "0"},
"523": {"name": "医疗器械", "parent": "522"},
"524": {"name": "保健品", "parent": "518"},
"525": {"name": "服务业", "parent": "0"},
"526": {"name": "能源/矿产/环保", "parent": "0"},
"527": {"name": "化工", "parent": "0"},
"528": {"name": "政府", "parent": "0"},
"529": {"name": "公共事业", "parent": "0"},
"530": {"name": "非盈利机构", "parent": "0"},
"531": {"name": "农业", "parent": "1131"},
"532": {"name": "林业", "parent": "1131"},
"533": {"name": "畜牧业", "parent": "1131"},
"534": {"name": "渔业", "parent": "1131"},
"535": {"name": "学术科研", "parent": "0"},
"536": {"name": "零售", "parent": "0"},
"537": {"name": "银行", "parent": "513"},
"538": {"name": "保险", "parent": "513"},
"539": {"name": "证券", "parent": "513"},
"540": {"name": "基金", "parent": "513"},
"541": {"name": "信托", "parent": "513"},
"542": {"name": "担保", "parent": "513"},
"543": {"name": "典当", "parent": "513"},
"544": {"name": "拍卖", "parent": "513"},
"545": {"name": "投资/融资", "parent": "513"},
"546": {"name": "期货", "parent": "513"},
"547": {"name": "房地产开发", "parent": "514"},
"548": {"name": "工程施工", "parent": "514"},
"549": {"name": "建筑设计", "parent": "514"},
"550": {"name": "房地产代理", "parent": "514"},
"551": {"name": "物业管理", "parent": "514"},
"552": {"name": "室内设计", "parent": "514"},
"553": {"name": "装修装潢", "parent": "514"},
"554": {"name": "市政工程", "parent": "514"},
"555": {"name": "工程造价", "parent": "514"},
"556": {"name": "工程监理", "parent": "514"},
"557": {"name": "环境工程", "parent": "514"},
"558": {"name": "园林景观", "parent": "514"},
"559": {"name": "法律", "parent": "515"},
"560": {"name": "人力资源", "parent": "515"},
"561": {"name": "会计", "parent": "1125"},
"562": {"name": "审计", "parent": "515"},
"563": {"name": "检测认证", "parent": "515"},
"565": {"name": "翻译", "parent": "515"},
"566": {"name": "中介", "parent": "515"},
"567": {"name": "咨询", "parent": "515"},
"568": {"name": "外包服务", "parent": "515"},
"569": {"name": "家教", "parent": "516"},
"570": {"name": "早教", "parent": "516"},
"571": {"name": "职业技能培训", "parent": "516"},
"572": {"name": "外语培训", "parent": "516"},
"573": {"name": "设计培训", "parent": "516"},
"574": {"name": "IT培训", "parent": "516"},
"575": {"name": "文艺体育培训", "parent": "516"},
"576": {"name": "学历教育", "parent": "516"},
"577": {"name": "管理培训", "parent": "516"},
"578": {"name": "民办基础教育", "parent": "516"},
"579": {"name": "广告", "parent": "517"},
"580": {"name": "媒体", "parent": "517"},
"581": {"name": "会展", "parent": "517"},
"582": {"name": "公关", "parent": "517"},
"583": {"name": "影视", "parent": "517"},
"584": {"name": "艺术", "parent": "517"},
"585": {"name": "文化传播", "parent": "517"},
"586": {"name": "娱乐", "parent": "517"},
"587": {"name": "体育", "parent": "517"},
"588": {"name": "出版", "parent": "517"},
"589": {"name": "休闲", "parent": "517"},
"590": {"name": "动漫", "parent": "517"},
"591": {"name": "市场推广", "parent": "517"},
"592": {"name": "市场研究", "parent": "517"},
"593": {"name": "食品", "parent": "1129"},
"594": {"name": "饮料", "parent": "1129"},
"595": {"name": "烟草", "parent": "1129"},
"596": {"name": "酒品", "parent": "518"},
"597": {"name": "服饰", "parent": "518"},
"598": {"name": "纺织", "parent": "518"},
"599": {"name": "化妆品", "parent": "1129"},
"600": {"name": "日用品", "parent": "1129"},
"601": {"name": "家电", "parent": "518"},
"602": {"name": "家具", "parent": "518"},
"603": {"name": "办公用品", "parent": "518"},
"604": {"name": "奢侈品", "parent": "518"},
"605": {"name": "珠宝", "parent": "518"},
"606": {"name": "数码产品", "parent": "518"},
"607": {"name": "玩具", "parent": "518"},
"608": {"name": "图书", "parent": "518"},
"609": {"name": "音像", "parent": "518"},
"610": {"name": "钟表", "parent": "518"},
"611": {"name": "箱包", "parent": "518"},
"612": {"name": "母婴", "parent": "518"},
"613": {"name": "营养保健", "parent": "518"},
"614": {"name": "户外用品", "parent": "518"},
"615": {"name": "健身器材", "parent": "518"},
"616": {"name": "乐器", "parent": "518"},
"617": {"name": "汽车用品", "parent": "518"},
"619": {"name": "厨具", "parent": "518"},
"620": {"name": "机械制造", "parent": "519"},
"621": {"name": "流体控制", "parent": "519"},
"622": {"name": "自动化控制", "parent": "519"},
"623": {"name": "仪器仪表", "parent": "519"},
"624": {"name": "航空/航天", "parent": "519"},
"625": {"name": "交通设施", "parent": "519"},
"626": {"name": "工业电子", "parent": "519"},
"627": {"name": "建材", "parent": "519"},
"628": {"name": "五金材料", "parent": "519"},
"629": {"name": "汽车", "parent": "519"},
"630": {"name": "印刷", "parent": "519"},
"631": {"name": "造纸", "parent": "519"},
"632": {"name": "包装", "parent": "519"},
"633": {"name": "原材料及加工", "parent": "519"},
"634": {"name": "物流", "parent": "520"},
"635": {"name": "仓储", "parent": "520"},
"636": {"name": "客运", "parent": "520"},
"637": {"name": "快递", "parent": "520"},
"638": {"name": "化学药", "parent": "522"},
"639": {"name": "中药", "parent": "522"},
"640": {"name": "生物制药", "parent": "522"},
"641": {"name": "兽药", "parent": "522"},
"642": {"name": "农药", "parent": "522"},
"643": {"name": "CRO", "parent": "522"},
"644": {"name": "消毒", "parent": "522"},
"645": {"name": "医药商业", "parent": "522"},
"646": {"name": "医疗服务", "parent": "522"},
"647": {"name": "医疗器械", "parent": "523"},
"648": {"name": "制药设备", "parent": "523"},
"649": {"name": "医用耗材", "parent": "523"},
"650": {"name": "手术器械", "parent": "523"},
"651": {"name": "保健器材", "parent": "524"},
"652": {"name": "性保健品", "parent": "524"},
"653": {"name": "医药保养", "parent": "524"},
"654": {"name": "医用保健", "parent": "524"},
"655": {"name": "酒店", "parent": "525"},
"656": {"name": "餐饮", "parent": "525"},
"657": {"name": "旅游", "parent": "525"},
"658": {"name": "生活服务", "parent": "525"},
"659": {"name": "保健服务", "parent": "525"},
"660": {"name": "运动健身", "parent": "525"},
"661": {"name": "家政服务", "parent": "525"},
"662": {"name": "婚庆服务", "parent": "525"},
"663": {"name": "租赁服务", "parent": "525"},
"664": {"name": "维修服务", "parent": "525"},
"665": {"name": "石油天然气", "parent": "526"},
"666": {"name": "电力", "parent": "526"},
"667": {"name": "新能源", "parent": "526"},
"668": {"name": "水利", "parent": "526"},
"669": {"name": "矿产", "parent": "526"},
"670": {"name": "采掘业", "parent": "526"},
"671": {"name": "冶炼", "parent": "526"},
"672": {"name": "环保", "parent": "526"},
"673": {"name": "无机化工原料", "parent": "527"},
"674": {"name": "有机化工原料", "parent": "527"},
"675": {"name": "精细化学品", "parent": "527"},
"676": {"name": "化工设备", "parent": "527"},
"677": {"name": "化工工程", "parent": "527"},
"678": {"name": "资产管理", "parent": "513"},
"679": {"name": "金融租赁", "parent": "513"},
"680": {"name": "征信及信评机构", "parent": "513"},
"681": {"name": "资产评估机构", "parent": "513"},
"683": {"name": "金融监管机构", "parent": "513"},
"684": {"name": "国际贸易", "parent": "521"},
"685": {"name": "海关", "parent": "521"},
"686": {"name": "购物中心", "parent": "536"},
"687": {"name": "超市", "parent": "536"},
"688": {"name": "便利店", "parent": "536"},
"689": {"name": "专卖店", "parent": "536"},
"690": {"name": "专业店", "parent": "536"},
"691": {"name": "百货店", "parent": "536"},
"692": {"name": "杂货店", "parent": "536"},
"693": {"name": "个人银行", "parent": "537"},
"695": {"name": "私人银行", "parent": "537"},
"696": {"name": "公司银行", "parent": "537"},
"697": {"name": "投资银行", "parent": "537"},
"698": {"name": "政策性银行", "parent": "537"},
"699": {"name": "中央银行", "parent": "537"},
"700": {"name": "人寿险", "parent": "538"},
"701": {"name": "财产险", "parent": "538"},
"702": {"name": "再保险", "parent": "538"},
"703": {"name": "养老险", "parent": "538"},
"704": {"name": "保险代理公司", "parent": "538"},
"705": {"name": "公募基金", "parent": "540"},
"707": {"name": "私募基金", "parent": "540"},
"708": {"name": "第三方理财", "parent": "679"},
"709": {"name": "资产管理公司", "parent": "679"},
"711": {"name": "房产中介", "parent": "566"},
"712": {"name": "职业中介", "parent": "566"},
"713": {"name": "婚姻中介", "parent": "566"},
"714": {"name": "战略咨询", "parent": "567"},
"715": {"name": "投资咨询", "parent": "567"},
"716": {"name": "心理咨询", "parent": "567"},
"717": {"name": "留学移民咨询", "parent": "567"},
"718": {"name": "工商注册代理", "parent": "568"},
"719": {"name": "商标专利代理", "parent": "568"},
"720": {"name": "财务代理", "parent": "568"},
"721": {"name": "工程机械", "parent": "620"},
"722": {"name": "农业机械", "parent": "620"},
"723": {"name": "海工设备", "parent": "620"},
"724": {"name": "包装机械", "parent": "620"},
"725": {"name": "印刷机械", "parent": "620"},
"726": {"name": "数控机床", "parent": "620"},
"727": {"name": "矿山机械", "parent": "620"},
"728": {"name": "水泵", "parent": "621"},
"729": {"name": "管道", "parent": "621"},
"730": {"name": "阀门", "parent": "621"},
"732": {"name": "压缩机", "parent": "621"},
"733": {"name": "集散控制系统", "parent": "622"},
"734": {"name": "远程控制", "parent": "622"},
"735": {"name": "液压系统", "parent": "622"},
"736": {"name": "楼宇智能化", "parent": "622"},
"737": {"name": "飞机制造", "parent": "624"},
"738": {"name": "航空公司", "parent": "624"},
"739": {"name": "发动机", "parent": "624"},
"740": {"name": "复合材料", "parent": "624"},
"741": {"name": "高铁", "parent": "625"},
"742": {"name": "地铁", "parent": "625"},
"743": {"name": "信号传输", "parent": "625"},
"745": {"name": "结构材料", "parent": "627"},
"746": {"name": "装饰材料", "parent": "627"},
"747": {"name": "专用材料", "parent": "627"},
"749": {"name": "经销商集团", "parent": "629"},
"750": {"name": "整车制造", "parent": "629"},
"751": {"name": "汽车零配件", "parent": "629"},
"752": {"name": "外型设计", "parent": "629"},
"753": {"name": "平版印刷", "parent": "630"},
"754": {"name": "凸版印刷", "parent": "630"},
"755": {"name": "凹版印刷", "parent": "630"},
"756": {"name": "孔版印刷", "parent": "630"},
"757": {"name": "印刷用纸", "parent": "631"},
"758": {"name": "书写、制图及复制用纸", "parent": "631"},
"759": {"name": "包装用纸", "parent": "631"},
"760": {"name": "生活、卫生及装饰用纸", "parent": "631"},
"761": {"name": "技术用纸", "parent": "631"},
"762": {"name": "加工纸原纸", "parent": "631"},
"763": {"name": "食品包装", "parent": "632"},
"764": {"name": "医药包装", "parent": "632"},
"765": {"name": "日化包装", "parent": "632"},
"766": {"name": "物流包装", "parent": "632"},
"767": {"name": "礼品包装", "parent": "632"},
"768": {"name": "电子五金包装", "parent": "632"},
"769": {"name": "汽车服务", "parent": "525"},
"770": {"name": "汽车保养", "parent": "769"},
"771": {"name": "租车", "parent": "769"},
"773": {"name": "出租车", "parent": "769"},
"774": {"name": "代驾", "parent": "769"},
"775": {"name": "发电", "parent": "666"},
"777": {"name": "输配电", "parent": "666"},
"779": {"name": "风电", "parent": "667"},
"780": {"name": "光伏/太阳能", "parent": "667"},
"781": {"name": "生物质发电", "parent": "667"},
"782": {"name": "煤化工", "parent": "667"},
"783": {"name": "垃圾发电", "parent": "667"},
"784": {"name": "核电", "parent": "667"},
"785": {"name": "能源矿产", "parent": "669"},
"786": {"name": "金属矿产", "parent": "669"},
"787": {"name": "非金属矿产", "parent": "669"},
"788": {"name": "水气矿产", "parent": "669"},
"789": {"name": "锅炉", "parent": "775"},
"790": {"name": "发电机", "parent": "775"},
"791": {"name": "汽轮机", "parent": "775"},
"792": {"name": "燃机", "parent": "775"},
"793": {"name": "冷却", "parent": "775"},
"794": {"name": "电力设计院", "parent": "775"},
"795": {"name": "高压输配电", "parent": "777"},
"796": {"name": "中压输配电", "parent": "777"},
"797": {"name": "低压输配电", "parent": "777"},
"798": {"name": "继电保护", "parent": "777"},
"799": {"name": "智能电网", "parent": "777"},
"800": {"name": "小学", "parent": "516"},
"801": {"name": "电动车", "parent": "519"},
"802": {"name": "皮具箱包", "parent": "518"},
"803": {"name": "医药制造", "parent": "522"},
"804": {"name": "电器销售", "parent": "536"},
"805": {"name": "塑料制品", "parent": "527"},
"806": {"name": "公益基金会", "parent": "530"},
"807": {"name": "美发服务", "parent": "525"},
"808": {"name": "农业养殖", "parent": "531"},
"809": {"name": "金融服务", "parent": "513"},
"810": {"name": "商业地产综合体", "parent": "514"},
"811": {"name": "美容服务", "parent": "525"},
"812": {"name": "灯饰", "parent": "518"},
"813": {"name": "油墨颜料产品", "parent": "527"},
"814": {"name": "眼镜制造", "parent": "518"},
"815": {"name": "农业生物技术", "parent": "531"},
"816": {"name": "体育用品", "parent": "518"},
"817": {"name": "保健用品", "parent": "524"},
"818": {"name": "化学化工产品", "parent": "527"},
"819": {"name": "饲料", "parent": "531"},
"821": {"name": "保安服务", "parent": "525"},
"822": {"name": "干细胞技术", "parent": "522"},
"824": {"name": "农药化肥", "parent": "527"},
"825": {"name": "卫生洁具", "parent": "518"},
"826": {"name": "体育器材、场馆", "parent": "518"},
"827": {"name": "饲料加工", "parent": "531"},
"828": {"name": "测绘服务", "parent": "529"},
"830": {"name": "金属船舶制造", "parent": "519"},
"831": {"name": "基因工程", "parent": "522"},
"832": {"name": "花卉服务", "parent": "536"},
"833": {"name": "农业种植", "parent": "531"},
"834": {"name": "皮革制品", "parent": "518"},
"835": {"name": "地理信息加工服务", "parent": "529"},
"836": {"name": "机器人", "parent": "519"},
"837": {"name": "礼品", "parent": "518"},
"838": {"name": "理发及美容服务", "parent": "525"},
"839": {"name": "其他清洁服务", "parent": "525"},
"840": {"name": "硅胶材料", "parent": "527"},
"841": {"name": "茶叶销售", "parent": "518"},
"842": {"name": "彩票活动", "parent": "529"},
"843": {"name": "化妆培训", "parent": "516"},
"844": {"name": "鞋业", "parent": "518"},
"845": {"name": "酒店用品", "parent": "518"},
"846": {"name": "复合材料", "parent": "527"},
"847": {"name": "房地产工程建设", "parent": "548"},
"848": {"name": "知识产权服务", "parent": "559"},
"849": {"name": "新型建材", "parent": "627"},
"850": {"name": "企业投资咨询", "parent": "567"},
"851": {"name": "含乳饮料和植物蛋白饮料制造", "parent": "594"},
"852": {"name": "汽车检测设备", "parent": "629"},
"853": {"name": "手机通讯器材", "parent": "417"},
"854": {"name": "环保材料", "parent": "672"},
"855": {"name": "交通设施", "parent": "554"},
"856": {"name": "电子器件", "parent": "419"},
"857": {"name": "啤酒", "parent": "594"},
"858": {"name": "生态旅游", "parent": "657"},
"859": {"name": "自动化设备", "parent": "626"},
"860": {"name": "软件开发", "parent": "414"},
"861": {"name": "葡萄酒销售", "parent": "594"},
"862": {"name": "钢材", "parent": "633"},
"863": {"name": "餐饮培训", "parent": "656"},
"864": {"name": "速冻食品", "parent": "593"},
"865": {"name": "空气环保", "parent": "672"},
"866": {"name": "互联网房地产经纪服务", "parent": "550"},
"867": {"name": "食品添加剂", "parent": "593"},
"868": {"name": "演艺传播", "parent": "585"},
"869": {"name": "信用卡", "parent": "537"},
"870": {"name": "报纸期刊广告", "parent": "579"},
"871": {"name": "摄影", "parent": "525"},
"872": {"name": "手机软件", "parent": "414"},
"873": {"name": "地坪建材", "parent": "627"},
"874": {"name": "企业管理咨询", "parent": "567"},
"875": {"name": "幼儿教育", "parent": "570"},
"876": {"name": "系统集成", "parent": "416"},
"877": {"name": "皮革服饰", "parent": "597"},
"878": {"name": "保健食品", "parent": "593"},
"879": {"name": "叉车", "parent": "620"},
"880": {"name": "厨卫电器", "parent": "601"},
"882": {"name": "地暖设备", "parent": "627"},
"883": {"name": "钢结构制造", "parent": "548"},
"884": {"name": "投影机", "parent": "606"},
"885": {"name": "啤酒销售", "parent": "594"},
"886": {"name": "度假村旅游", "parent": "657"},
"887": {"name": "电力元件设备", "parent": "626"},
"888": {"name": "管理软件", "parent": "414"},
"889": {"name": "轴承", "parent": "628"},
"890": {"name": "餐饮设备", "parent": "656"},
"891": {"name": "肉制品及副产品加工", "parent": "593"},
"892": {"name": "艺术收藏品投资交易", "parent": "584"},
"893": {"name": "净水器", "parent": "601"},
"894": {"name": "进口食品", "parent": "593"},
"895": {"name": "娱乐文化传播", "parent": "585"},
"896": {"name": "文化传播", "parent": "585"},
"897": {"name": "商旅传媒", "parent": "580"},
"898": {"name": "广告设计制作", "parent": "579"},
"899": {"name": "金属丝绳及其制品制造", "parent": "627"},
"900": {"name": "建筑涂料", "parent": "627"},
"901": {"name": "抵押贷款", "parent": "543"},
"902": {"name": "早教", "parent": "570"},
"903": {"name": "电影放映", "parent": "583"},
"904": {"name": "内衣服饰", "parent": "597"},
"905": {"name": "无线网络通信", "parent": "418"},
"906": {"name": "记忆卡", "parent": "415"},
"907": {"name": "女装服饰", "parent": "597"},
"908": {"name": "建筑机械", "parent": "620"},
"909": {"name": "制冷电器", "parent": "601"},
"910": {"name": "通信设备", "parent": "417"},
"911": {"name": "空调设备", "parent": "601"},
"912": {"name": "建筑装饰", "parent": "553"},
"913": {"name": "办公设备", "parent": "603"},
"916": {"name": "数据处理软件", "parent": "414"},
"917": {"name": "葡萄酒贸易", "parent": "594"},
"918": {"name": "通讯器材", "parent": "417"},
"919": {"name": "铜业", "parent": "633"},
"920": {"name": "食堂", "parent": "656"},
"921": {"name": "糖果零食", "parent": "593"},
"922": {"name": "文化艺术传播", "parent": "584"},
"923": {"name": "太阳能电器", "parent": "601"},
"924": {"name": "药品零售", "parent": "645"},
"925": {"name": "果蔬食品", "parent": "593"},
"926": {"name": "文化活动策划", "parent": "585"},
"928": {"name": "汽车广告", "parent": "657"},
"929": {"name": "条码设备", "parent": "630"},
"930": {"name": "建筑石材", "parent": "627"},
"931": {"name": "贵金属", "parent": "545"},
"932": {"name": "体育", "parent": "660"},
"933": {"name": "金融信息服务", "parent": "414"},
"934": {"name": "玻璃建材", "parent": "627"},
"935": {"name": "家教", "parent": "569"},
"936": {"name": "歌舞厅娱乐活动", "parent": "586"},
"937": {"name": "计算机服务器", "parent": "415"},
"938": {"name": "管道", "parent": "627"},
"939": {"name": "婴幼儿服饰", "parent": "597"},
"940": {"name": "热水器", "parent": "601"},
"941": {"name": "计算机及零部件制造", "parent": "415"},
"942": {"name": "钢铁贸易", "parent": "633"},
"944": {"name": "包装材料", "parent": "632"},
"945": {"name": "计算机办公设备", "parent": "603"},
"946": {"name": "白酒", "parent": "594"},
"948": {"name": "发动机", "parent": "620"},
"949": {"name": "快餐服务", "parent": "656"},
"950": {"name": "酒类销售", "parent": "594"},
"951": {"name": "电子产品、机电设备", "parent": "626"},
"952": {"name": "激光设备", "parent": "626"},
"953": {"name": "餐饮策划", "parent": "656"},
"954": {"name": "饮料、食品", "parent": "594"},
"955": {"name": "文化娱乐经纪", "parent": "585"},
"956": {"name": "天然气", "parent": "665"},
"957": {"name": "农副食品", "parent": "593"},
"958": {"name": "艺术表演", "parent": "585"},
"959": {"name": "石膏、水泥制品及类似制品制造", "parent": "627"},
"960": {"name": "橱柜", "parent": "602"},
"961": {"name": "管理培训", "parent": "577"},
"962": {"name": "男装服饰", "parent": "597"},
"963": {"name": "化肥制造", "parent": "675"},
"964": {"name": "童装服饰", "parent": "597"},
"965": {"name": "电源电池", "parent": "626"},
"966": {"name": "家电维修", "parent": "664"},
"967": {"name": "光电子器件", "parent": "419"},
"968": {"name": "旅行社服务", "parent": "657"},
"969": {"name": "电线、电缆制造", "parent": "626"},
"970": {"name": "软件开发、信息系统集成", "parent": "419"},
"971": {"name": "白酒制造", "parent": "594"},
"973": {"name": "甜品服务", "parent": "656"},
"974": {"name": "糕点、面包制造", "parent": "593"},
"975": {"name": "木工机械", "parent": "620"},
"976": {"name": "酒吧服务", "parent": "656"},
"977": {"name": "火腿肠", "parent": "593"},
"978": {"name": "广告策划推广", "parent": "579"},
"979": {"name": "新能源产品和生产装备制造", "parent": "667"},
"980": {"name": "调味品", "parent": "593"},
"981": {"name": "礼仪表演", "parent": "585"},
"982": {"name": "劳务派遣", "parent": "560"},
"983": {"name": "建材零售", "parent": "627"},
"984": {"name": "商品交易中心", "parent": "545"},
"985": {"name": "体育推广", "parent": "585"},
"986": {"name": "茶饮料及其他饮料制造", "parent": "594"},
"987": {"name": "金属建材", "parent": "627"},
"988": {"name": "职业技能培训", "parent": "571"},
"989": {"name": "网吧活动", "parent": "586"},
"990": {"name": "洗衣服务", "parent": "658"},
"991": {"name": "管道工程", "parent": "554"},
"992": {"name": "通信工程", "parent": "417"},
"993": {"name": "电子元器件", "parent": "626"},
"994": {"name": "电子设备", "parent": "419"},
"995": {"name": "茶馆服务", "parent": "656"},
"996": {"name": "旅游开发", "parent": "657"},
"997": {"name": "视频通讯", "parent": "417"},
"998": {"name": "白酒销售", "parent": "594"},
"1000": {"name": "咖啡馆服务", "parent": "656"},
"1001": {"name": "食品零售", "parent": "593"},
"1002": {"name": "健康疗养旅游", "parent": "655"},
"1003": {"name": "粮油食品", "parent": "593"},
"1004": {"name": "儿童教育影视", "parent": "583"},
"1005": {"name": "新能源发电", "parent": "667"},
"1006": {"name": "旅游策划", "parent": "657"},
"1007": {"name": "绘画", "parent": "575"},
"1008": {"name": "方便面及其他方便食品", "parent": "593"},
"1009": {"name": "房地产经纪", "parent": "550"},
"1010": {"name": "母婴家政", "parent": "661"},
"1011": {"name": "居家养老健康服务", "parent": "661"},
"1012": {"name": "文化艺术投资", "parent": "545"},
"1013": {"name": "运动健身", "parent": "660"},
"1014": {"name": "瓶(罐)装饮用水制造", "parent": "594"},
"1015": {"name": "金属门窗", "parent": "627"},
"1016": {"name": "机动车检测", "parent": "563"},
"1017": {"name": "货物运输", "parent": "634"},
"1018": {"name": "服饰专卖", "parent": "690"},
"1019": {"name": "酒店服装", "parent": "597"},
"1020": {"name": "通讯软件", "parent": "417"},
"1021": {"name": "消防工程", "parent": "554"},
"1022": {"name": "嵌入式电子系统", "parent": "419"},
"1023": {"name": "航空票务", "parent": "636"},
"1024": {"name": "电气设备", "parent": "626"},
"1025": {"name": "酒业贸易", "parent": "594"},
"1027": {"name": "其他饮料及冷饮服务", "parent": "656"},
"1028": {"name": "乳制品", "parent": "593"},
"1029": {"name": "新闻期刊出版", "parent": "588"},
"1030": {"name": "水污染治理", "parent": "672"},
"1031": {"name": "谷物食品", "parent": "593"},
"1032": {"name": "数字动漫设计制造服务", "parent": "590"},
"1033": {"name": "医院", "parent": "646"},
"1034": {"name": "旅游广告", "parent": "657"},
"1035": {"name": "办公家具", "parent": "602"},
"1036": {"name": "房地产营销策划", "parent": "550"},
"1037": {"name": "保洁家政", "parent": "661"},
"1038": {"name": "水泥制造", "parent": "627"},
"1039": {"name": "市场研究咨询", "parent": "567"},
"1040": {"name": "驾校", "parent": "571"},
"1041": {"name": "正餐服务", "parent": "656"},
"1043": {"name": "机动车燃油", "parent": "665"},
"1044": {"name": "食品", "parent": "593"},
"1045": {"name": "新能源汽车", "parent": "629"},
"1046": {"name": "手机无线网络推广", "parent": "417"},
"1047": {"name": "环保设备", "parent": "672"},
"1048": {"name": "通讯工程", "parent": "418"},
"1049": {"name": "半导体集成电路", "parent": "419"},
"1050": {"name": "航空服务", "parent": "636"},
"1051": {"name": "电机设备", "parent": "626"},
"1052": {"name": "档案软件", "parent": "414"},
"1053": {"name": "冷链物流服务", "parent": "634"},
"1054": {"name": "小吃服务", "parent": "656"},
"1055": {"name": "水产品加工", "parent": "593"},
"1056": {"name": "图书出版", "parent": "588"},
"1057": {"name": "固体废物治理", "parent": "672"},
"1059": {"name": "坚果食品", "parent": "593"},
"1060": {"name": "广告传媒", "parent": "579"},
"1061": {"name": "电梯", "parent": "622"},
"1062": {"name": "社区医疗与卫生院", "parent": "646"},
"1063": {"name": "广告、印刷包装", "parent": "630"},
"1064": {"name": "婚纱礼服", "parent": "662"},
"1065": {"name": "地毯", "parent": "602"},
"1066": {"name": "互联网物业", "parent": "551"},
"1067": {"name": "跨境电商", "parent": "3"},
"1068": {"name": "信息安全、系统集成", "parent": "9"},
"1069": {"name": "专用汽车制造", "parent": "750"},
"1070": {"name": "商品贸易", "parent": "3"},
"1071": {"name": "墙壁装饰材料", "parent": "746"},
"1072": {"name": "窗帘装饰材料", "parent": "746"},
"1073": {"name": "电子商务、本地生活服务", "parent": "3"},
"1075": {"name": "白酒电子商务", "parent": "3"},
"1076": {"name": "商品贸易、电子商务", "parent": "3"},
"1077": {"name": "木质装饰材料", "parent": "746"},
"1078": {"name": "电子商务、汽车电商交易平台", "parent": "3"},
"1079": {"name": "汽车轮胎", "parent": "751"},
"1080": {"name": "气体压缩机械制造", "parent": "732"},
"1081": {"name": "家装家具电子商务", "parent": "3"},
"1082": {"name": "化妆品电子商务", "parent": "3"},
"1083": {"name": "汽车销售", "parent": "749"},
"1084": {"name": "新闻资讯网站", "parent": "510"},
"1085": {"name": "母婴电商", "parent": "3"},
"1086": {"name": "电商商务、收藏品交易", "parent": "3"},
"1088": {"name": "电子商务、数码产品", "parent": "3"},
"1089": {"name": "二手车交易", "parent": "749"},
"1090": {"name": "游戏制作服务", "parent": "5"},
"1091": {"name": "母婴服务", "parent": "510"},
"1092": {"name": "家具电子商务", "parent": "3"},
"1093": {"name": "汽车配件电子商务", "parent": "3"},
"1094": {"name": "输配电设备", "parent": "777"},
"1095": {"name": "矿山设备", "parent": "727"},
"1096": {"name": "机床机械", "parent": "726"},
"1097": {"name": "农产品电商", "parent": "3"},
"1098": {"name": "陶瓷装饰材料", "parent": "746"},
"1099": {"name": "车载联网设备", "parent": "487"},
"1100": {"name": "汽车销售电子商务", "parent": "3"},
"1101": {"name": "石油设备", "parent": "730"},
"1102": {"name": "智能家居", "parent": "487"},
"1103": {"name": "散热器", "parent": "751"},
"1104": {"name": "电力工程", "parent": "775"},
"1105": {"name": "生鲜电商", "parent": "3"},
"1106": {"name": "互联网数据服务", "parent": "490"},
"1107": {"name": "房车、商务车销售", "parent": "749"},
"1108": {"name": "茶叶电子商务", "parent": "3"},
"1109": {"name": "酒类电子商务", "parent": "3"},
"1110": {"name": "阀门", "parent": "730"},
"1111": {"name": "食品电商", "parent": "3"},
"1112": {"name": "儿童摄影", "parent": "871"},
"1113": {"name": "广告摄影", "parent": "871"},
"1114": {"name": "婚纱摄影", "parent": "871"},
"1115": {"name": "模具制造", "parent": "620"},
"1116": {"name": "汽车模具", "parent": "629"},
"1117": {"name": "认证咨询", "parent": "567"},
"1118": {"name": "数字视觉制作服务", "parent": "590"},
"1119": {"name": "牙科及医疗器械", "parent": "646"},
"1120": {"name": "猎头招聘", "parent": "560"},
"1121": {"name": "家居", "parent": "518"},
"1122": {"name": "收藏品", "parent": "518"},
"1123": {"name": "首饰", "parent": "518"},
"1124": {"name": "工艺品", "parent": "518"},
"1125": {"name": "财务", "parent": "515"},
"1126": {"name": "税务", "parent": "515"},
"1127": {"name": "分类信息", "parent": "2"},
"1128": {"name": "宠物", "parent": "0"},
"1129": {"name": "快消品", "parent": "518"},
"1130": {"name": "人工智能", "parent": "2"},
"1131": {"name": "农/林/牧/渔", "parent": "0"},
}
def get_names(id):
id = str(id)
nms = []
d = TBL.get(id)
if not d:
return []
nms.append(d["name"])
p = get_names(d["parent"])
if p:
nms.extend(p)
return nms
if __name__ == "__main__":
print(get_names("1119"))
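
For reference, the demo above resolves the parent chain leaf-to-root; based on the entries in TBL it is expected to print:

# get_names walks parent pointers until an id is missing from TBL:
#   "1119" (牙科及医疗器械) -> "646" (医疗服务) -> "522" (医药) -> parent "0" stops
# so the demo prints: ['牙科及医疗器械', '医疗服务', '医药']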

View File

@ -1,789 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
TBL = {
"2": {"name": "北京", "parent": "1"},
"3": {"name": "天津", "parent": "1"},
"4": {"name": "河北", "parent": "1"},
"5": {"name": "山西", "parent": "1"},
"6": {"name": "内蒙古", "parent": "1"},
"7": {"name": "辽宁", "parent": "1"},
"8": {"name": "吉林", "parent": "1"},
"9": {"name": "黑龙江", "parent": "1"},
"10": {"name": "上海", "parent": "1"},
"11": {"name": "江苏", "parent": "1"},
"12": {"name": "浙江", "parent": "1"},
"13": {"name": "安徽", "parent": "1"},
"14": {"name": "福建", "parent": "1"},
"15": {"name": "江西", "parent": "1"},
"16": {"name": "山东", "parent": "1"},
"17": {"name": "河南", "parent": "1"},
"18": {"name": "湖北", "parent": "1"},
"19": {"name": "湖南", "parent": "1"},
"20": {"name": "广东", "parent": "1"},
"21": {"name": "广西", "parent": "1"},
"22": {"name": "海南", "parent": "1"},
"23": {"name": "重庆", "parent": "1"},
"24": {"name": "四川", "parent": "1"},
"25": {"name": "贵州", "parent": "1"},
"26": {"name": "云南", "parent": "1"},
"27": {"name": "西藏", "parent": "1"},
"28": {"name": "陕西", "parent": "1"},
"29": {"name": "甘肃", "parent": "1"},
"30": {"name": "青海", "parent": "1"},
"31": {"name": "宁夏", "parent": "1"},
"32": {"name": "新疆", "parent": "1"},
"33": {"name": "北京市", "parent": "2"},
"34": {"name": "天津市", "parent": "3"},
"35": {"name": "石家庄市", "parent": "4"},
"36": {"name": "唐山市", "parent": "4"},
"37": {"name": "秦皇岛市", "parent": "4"},
"38": {"name": "邯郸市", "parent": "4"},
"39": {"name": "邢台市", "parent": "4"},
"40": {"name": "保定市", "parent": "4"},
"41": {"name": "张家口市", "parent": "4"},
"42": {"name": "承德市", "parent": "4"},
"43": {"name": "沧州市", "parent": "4"},
"44": {"name": "廊坊市", "parent": "4"},
"45": {"name": "衡水市", "parent": "4"},
"46": {"name": "太原市", "parent": "5"},
"47": {"name": "大同市", "parent": "5"},
"48": {"name": "阳泉市", "parent": "5"},
"49": {"name": "长治市", "parent": "5"},
"50": {"name": "晋城市", "parent": "5"},
"51": {"name": "朔州市", "parent": "5"},
"52": {"name": "晋中市", "parent": "5"},
"53": {"name": "运城市", "parent": "5"},
"54": {"name": "忻州市", "parent": "5"},
"55": {"name": "临汾市", "parent": "5"},
"56": {"name": "吕梁市", "parent": "5"},
"57": {"name": "呼和浩特市", "parent": "6"},
"58": {"name": "包头市", "parent": "6"},
"59": {"name": "乌海市", "parent": "6"},
"60": {"name": "赤峰市", "parent": "6"},
"61": {"name": "通辽市", "parent": "6"},
"62": {"name": "鄂尔多斯市", "parent": "6"},
"63": {"name": "呼伦贝尔市", "parent": "6"},
"64": {"name": "巴彦淖尔市", "parent": "6"},
"65": {"name": "乌兰察布市", "parent": "6"},
"66": {"name": "兴安盟", "parent": "6"},
"67": {"name": "锡林郭勒盟", "parent": "6"},
"68": {"name": "阿拉善盟", "parent": "6"},
"69": {"name": "沈阳市", "parent": "7"},
"70": {"name": "大连市", "parent": "7"},
"71": {"name": "鞍山市", "parent": "7"},
"72": {"name": "抚顺市", "parent": "7"},
"73": {"name": "本溪市", "parent": "7"},
"74": {"name": "丹东市", "parent": "7"},
"75": {"name": "锦州市", "parent": "7"},
"76": {"name": "营口市", "parent": "7"},
"77": {"name": "阜新市", "parent": "7"},
"78": {"name": "辽阳市", "parent": "7"},
"79": {"name": "盘锦市", "parent": "7"},
"80": {"name": "铁岭市", "parent": "7"},
"81": {"name": "朝阳市", "parent": "7"},
"82": {"name": "葫芦岛市", "parent": "7"},
"83": {"name": "长春市", "parent": "8"},
"84": {"name": "吉林市", "parent": "8"},
"85": {"name": "四平市", "parent": "8"},
"86": {"name": "辽源市", "parent": "8"},
"87": {"name": "通化市", "parent": "8"},
"88": {"name": "白山市", "parent": "8"},
"89": {"name": "松原市", "parent": "8"},
"90": {"name": "白城市", "parent": "8"},
"91": {"name": "延边朝鲜族自治州", "parent": "8"},
"92": {"name": "哈尔滨市", "parent": "9"},
"93": {"name": "齐齐哈尔市", "parent": "9"},
"94": {"name": "鸡西市", "parent": "9"},
"95": {"name": "鹤岗市", "parent": "9"},
"96": {"name": "双鸭山市", "parent": "9"},
"97": {"name": "大庆市", "parent": "9"},
"98": {"name": "伊春市", "parent": "9"},
"99": {"name": "佳木斯市", "parent": "9"},
"100": {"name": "七台河市", "parent": "9"},
"101": {"name": "牡丹江市", "parent": "9"},
"102": {"name": "黑河市", "parent": "9"},
"103": {"name": "绥化市", "parent": "9"},
"104": {"name": "大兴安岭地区", "parent": "9"},
"105": {"name": "上海市", "parent": "10"},
"106": {"name": "南京市", "parent": "11"},
"107": {"name": "无锡市", "parent": "11"},
"108": {"name": "徐州市", "parent": "11"},
"109": {"name": "常州市", "parent": "11"},
"110": {"name": "苏州市", "parent": "11"},
"111": {"name": "南通市", "parent": "11"},
"112": {"name": "连云港市", "parent": "11"},
"113": {"name": "淮安市", "parent": "11"},
"114": {"name": "盐城市", "parent": "11"},
"115": {"name": "扬州市", "parent": "11"},
"116": {"name": "镇江市", "parent": "11"},
"117": {"name": "泰州市", "parent": "11"},
"118": {"name": "宿迁市", "parent": "11"},
"119": {"name": "杭州市", "parent": "12"},
"120": {"name": "宁波市", "parent": "12"},
"121": {"name": "温州市", "parent": "12"},
"122": {"name": "嘉兴市", "parent": "12"},
"123": {"name": "湖州市", "parent": "12"},
"124": {"name": "绍兴市", "parent": "12"},
"125": {"name": "金华市", "parent": "12"},
"126": {"name": "衢州市", "parent": "12"},
"127": {"name": "舟山市", "parent": "12"},
"128": {"name": "台州市", "parent": "12"},
"129": {"name": "丽水市", "parent": "12"},
"130": {"name": "合肥市", "parent": "13"},
"131": {"name": "芜湖市", "parent": "13"},
"132": {"name": "蚌埠市", "parent": "13"},
"133": {"name": "淮南市", "parent": "13"},
"134": {"name": "马鞍山市", "parent": "13"},
"135": {"name": "淮北市", "parent": "13"},
"136": {"name": "铜陵市", "parent": "13"},
"137": {"name": "安庆市", "parent": "13"},
"138": {"name": "黄山市", "parent": "13"},
"139": {"name": "滁州市", "parent": "13"},
"140": {"name": "阜阳市", "parent": "13"},
"141": {"name": "宿州市", "parent": "13"},
"143": {"name": "六安市", "parent": "13"},
"144": {"name": "亳州市", "parent": "13"},
"145": {"name": "池州市", "parent": "13"},
"146": {"name": "宣城市", "parent": "13"},
"147": {"name": "福州市", "parent": "14"},
"148": {"name": "厦门市", "parent": "14"},
"149": {"name": "莆田市", "parent": "14"},
"150": {"name": "三明市", "parent": "14"},
"151": {"name": "泉州市", "parent": "14"},
"152": {"name": "漳州市", "parent": "14"},
"153": {"name": "南平市", "parent": "14"},
"154": {"name": "龙岩市", "parent": "14"},
"155": {"name": "宁德市", "parent": "14"},
"156": {"name": "南昌市", "parent": "15"},
"157": {"name": "景德镇市", "parent": "15"},
"158": {"name": "萍乡市", "parent": "15"},
"159": {"name": "九江市", "parent": "15"},
"160": {"name": "新余市", "parent": "15"},
"161": {"name": "鹰潭市", "parent": "15"},
"162": {"name": "赣州市", "parent": "15"},
"163": {"name": "吉安市", "parent": "15"},
"164": {"name": "宜春市", "parent": "15"},
"165": {"name": "抚州市", "parent": "15"},
"166": {"name": "上饶市", "parent": "15"},
"167": {"name": "济南市", "parent": "16"},
"168": {"name": "青岛市", "parent": "16"},
"169": {"name": "淄博市", "parent": "16"},
"170": {"name": "枣庄市", "parent": "16"},
"171": {"name": "东营市", "parent": "16"},
"172": {"name": "烟台市", "parent": "16"},
"173": {"name": "潍坊市", "parent": "16"},
"174": {"name": "济宁市", "parent": "16"},
"175": {"name": "泰安市", "parent": "16"},
"176": {"name": "威海市", "parent": "16"},
"177": {"name": "日照市", "parent": "16"},
"179": {"name": "临沂市", "parent": "16"},
"180": {"name": "德州市", "parent": "16"},
"181": {"name": "聊城市", "parent": "16"},
"182": {"name": "滨州市", "parent": "16"},
"183": {"name": "菏泽市", "parent": "16"},
"184": {"name": "郑州市", "parent": "17"},
"185": {"name": "开封市", "parent": "17"},
"186": {"name": "洛阳市", "parent": "17"},
"187": {"name": "平顶山市", "parent": "17"},
"188": {"name": "安阳市", "parent": "17"},
"189": {"name": "鹤壁市", "parent": "17"},
"190": {"name": "新乡市", "parent": "17"},
"191": {"name": "焦作市", "parent": "17"},
"192": {"name": "濮阳市", "parent": "17"},
"193": {"name": "许昌市", "parent": "17"},
"194": {"name": "漯河市", "parent": "17"},
"195": {"name": "三门峡市", "parent": "17"},
"196": {"name": "南阳市", "parent": "17"},
"197": {"name": "商丘市", "parent": "17"},
"198": {"name": "信阳市", "parent": "17"},
"199": {"name": "周口市", "parent": "17"},
"200": {"name": "驻马店市", "parent": "17"},
"201": {"name": "武汉市", "parent": "18"},
"202": {"name": "黄石市", "parent": "18"},
"203": {"name": "十堰市", "parent": "18"},
"204": {"name": "宜昌市", "parent": "18"},
"205": {"name": "襄阳市", "parent": "18"},
"206": {"name": "鄂州市", "parent": "18"},
"207": {"name": "荆门市", "parent": "18"},
"208": {"name": "孝感市", "parent": "18"},
"209": {"name": "荆州市", "parent": "18"},
"210": {"name": "黄冈市", "parent": "18"},
"211": {"name": "咸宁市", "parent": "18"},
"212": {"name": "随州市", "parent": "18"},
"213": {"name": "恩施土家族苗族自治州", "parent": "18"},
"215": {"name": "长沙市", "parent": "19"},
"216": {"name": "株洲市", "parent": "19"},
"217": {"name": "湘潭市", "parent": "19"},
"218": {"name": "衡阳市", "parent": "19"},
"219": {"name": "邵阳市", "parent": "19"},
"220": {"name": "岳阳市", "parent": "19"},
"221": {"name": "常德市", "parent": "19"},
"222": {"name": "张家界市", "parent": "19"},
"223": {"name": "益阳市", "parent": "19"},
"224": {"name": "郴州市", "parent": "19"},
"225": {"name": "永州市", "parent": "19"},
"226": {"name": "怀化市", "parent": "19"},
"227": {"name": "娄底市", "parent": "19"},
"228": {"name": "湘西土家族苗族自治州", "parent": "19"},
"229": {"name": "广州市", "parent": "20"},
"230": {"name": "韶关市", "parent": "20"},
"231": {"name": "深圳市", "parent": "20"},
"232": {"name": "珠海市", "parent": "20"},
"233": {"name": "汕头市", "parent": "20"},
"234": {"name": "佛山市", "parent": "20"},
"235": {"name": "江门市", "parent": "20"},
"236": {"name": "湛江市", "parent": "20"},
"237": {"name": "茂名市", "parent": "20"},
"238": {"name": "肇庆市", "parent": "20"},
"239": {"name": "惠州市", "parent": "20"},
"240": {"name": "梅州市", "parent": "20"},
"241": {"name": "汕尾市", "parent": "20"},
"242": {"name": "河源市", "parent": "20"},
"243": {"name": "阳江市", "parent": "20"},
"244": {"name": "清远市", "parent": "20"},
"245": {"name": "东莞市", "parent": "20"},
"246": {"name": "中山市", "parent": "20"},
"247": {"name": "潮州市", "parent": "20"},
"248": {"name": "揭阳市", "parent": "20"},
"249": {"name": "云浮市", "parent": "20"},
"250": {"name": "南宁市", "parent": "21"},
"251": {"name": "柳州市", "parent": "21"},
"252": {"name": "桂林市", "parent": "21"},
"253": {"name": "梧州市", "parent": "21"},
"254": {"name": "北海市", "parent": "21"},
"255": {"name": "防城港市", "parent": "21"},
"256": {"name": "钦州市", "parent": "21"},
"257": {"name": "贵港市", "parent": "21"},
"258": {"name": "玉林市", "parent": "21"},
"259": {"name": "百色市", "parent": "21"},
"260": {"name": "贺州市", "parent": "21"},
"261": {"name": "河池市", "parent": "21"},
"262": {"name": "来宾市", "parent": "21"},
"263": {"name": "崇左市", "parent": "21"},
"264": {"name": "海口市", "parent": "22"},
"265": {"name": "三亚市", "parent": "22"},
"267": {"name": "重庆市", "parent": "23"},
"268": {"name": "成都市", "parent": "24"},
"269": {"name": "自贡市", "parent": "24"},
"270": {"name": "攀枝花市", "parent": "24"},
"271": {"name": "泸州市", "parent": "24"},
"272": {"name": "德阳市", "parent": "24"},
"273": {"name": "绵阳市", "parent": "24"},
"274": {"name": "广元市", "parent": "24"},
"275": {"name": "遂宁市", "parent": "24"},
"276": {"name": "内江市", "parent": "24"},
"277": {"name": "乐山市", "parent": "24"},
"278": {"name": "南充市", "parent": "24"},
"279": {"name": "眉山市", "parent": "24"},
"280": {"name": "宜宾市", "parent": "24"},
"281": {"name": "广安市", "parent": "24"},
"282": {"name": "达州市", "parent": "24"},
"283": {"name": "雅安市", "parent": "24"},
"284": {"name": "巴中市", "parent": "24"},
"285": {"name": "资阳市", "parent": "24"},
"286": {"name": "阿坝藏族羌族自治州", "parent": "24"},
"287": {"name": "甘孜藏族自治州", "parent": "24"},
"288": {"name": "凉山彝族自治州", "parent": "24"},
"289": {"name": "贵阳市", "parent": "25"},
"290": {"name": "六盘水市", "parent": "25"},
"291": {"name": "遵义市", "parent": "25"},
"292": {"name": "安顺市", "parent": "25"},
"293": {"name": "铜仁市", "parent": "25"},
"294": {"name": "黔西南布依族苗族自治州", "parent": "25"},
"295": {"name": "毕节市", "parent": "25"},
"296": {"name": "黔东南苗族侗族自治州", "parent": "25"},
"297": {"name": "黔南布依族苗族自治州", "parent": "25"},
"298": {"name": "昆明市", "parent": "26"},
"299": {"name": "曲靖市", "parent": "26"},
"300": {"name": "玉溪市", "parent": "26"},
"301": {"name": "保山市", "parent": "26"},
"302": {"name": "昭通市", "parent": "26"},
"303": {"name": "丽江市", "parent": "26"},
"304": {"name": "普洱市", "parent": "26"},
"305": {"name": "临沧市", "parent": "26"},
"306": {"name": "楚雄彝族自治州", "parent": "26"},
"307": {"name": "红河哈尼族彝族自治州", "parent": "26"},
"308": {"name": "文山壮族苗族自治州", "parent": "26"},
"309": {"name": "西双版纳傣族自治州", "parent": "26"},
"310": {"name": "大理白族自治州", "parent": "26"},
"311": {"name": "德宏傣族景颇族自治州", "parent": "26"},
"312": {"name": "怒江傈僳族自治州", "parent": "26"},
"313": {"name": "迪庆藏族自治州", "parent": "26"},
"314": {"name": "拉萨市", "parent": "27"},
"315": {"name": "昌都市", "parent": "27"},
"316": {"name": "山南市", "parent": "27"},
"317": {"name": "日喀则市", "parent": "27"},
"318": {"name": "那曲市", "parent": "27"},
"319": {"name": "阿里地区", "parent": "27"},
"320": {"name": "林芝市", "parent": "27"},
"321": {"name": "西安市", "parent": "28"},
"322": {"name": "铜川市", "parent": "28"},
"323": {"name": "宝鸡市", "parent": "28"},
"324": {"name": "咸阳市", "parent": "28"},
"325": {"name": "渭南市", "parent": "28"},
"326": {"name": "延安市", "parent": "28"},
"327": {"name": "汉中市", "parent": "28"},
"328": {"name": "榆林市", "parent": "28"},
"329": {"name": "安康市", "parent": "28"},
"330": {"name": "商洛市", "parent": "28"},
"331": {"name": "兰州市", "parent": "29"},
"332": {"name": "嘉峪关市", "parent": "29"},
"333": {"name": "金昌市", "parent": "29"},
"334": {"name": "白银市", "parent": "29"},
"335": {"name": "天水市", "parent": "29"},
"336": {"name": "武威市", "parent": "29"},
"337": {"name": "张掖市", "parent": "29"},
"338": {"name": "平凉市", "parent": "29"},
"339": {"name": "酒泉市", "parent": "29"},
"340": {"name": "庆阳市", "parent": "29"},
"341": {"name": "定西市", "parent": "29"},
"342": {"name": "陇南市", "parent": "29"},
"343": {"name": "临夏回族自治州", "parent": "29"},
"344": {"name": "甘南藏族自治州", "parent": "29"},
"345": {"name": "西宁市", "parent": "30"},
"346": {"name": "海东市", "parent": "30"},
"347": {"name": "海北藏族自治州", "parent": "30"},
"348": {"name": "黄南藏族自治州", "parent": "30"},
"349": {"name": "海南藏族自治州", "parent": "30"},
"350": {"name": "果洛藏族自治州", "parent": "30"},
"351": {"name": "玉树藏族自治州", "parent": "30"},
"352": {"name": "海西蒙古族藏族自治州", "parent": "30"},
"353": {"name": "银川市", "parent": "31"},
"354": {"name": "石嘴山市", "parent": "31"},
"355": {"name": "吴忠市", "parent": "31"},
"356": {"name": "固原市", "parent": "31"},
"357": {"name": "中卫市", "parent": "31"},
"358": {"name": "乌鲁木齐市", "parent": "32"},
"359": {"name": "克拉玛依市", "parent": "32"},
"360": {"name": "吐鲁番市", "parent": "32"},
"361": {"name": "哈密市", "parent": "32"},
"362": {"name": "昌吉回族自治州", "parent": "32"},
"363": {"name": "博尔塔拉蒙古自治州", "parent": "32"},
"364": {"name": "巴音郭楞蒙古自治州", "parent": "32"},
"365": {"name": "阿克苏地区", "parent": "32"},
"366": {"name": "克孜勒苏柯尔克孜自治州", "parent": "32"},
"367": {"name": "喀什地区", "parent": "32"},
"368": {"name": "和田地区", "parent": "32"},
"369": {"name": "伊犁哈萨克自治州", "parent": "32"},
"370": {"name": "塔城地区", "parent": "32"},
"371": {"name": "阿勒泰地区", "parent": "32"},
"372": {"name": "新疆省直辖行政单位", "parent": "32"},
"373": {"name": "可克达拉市", "parent": "32"},
"374": {"name": "昆玉市", "parent": "32"},
"375": {"name": "胡杨河市", "parent": "32"},
"376": {"name": "双河市", "parent": "32"},
"3560": {"name": "北票市", "parent": "7"},
"3615": {"name": "高州市", "parent": "20"},
"3651": {"name": "济源市", "parent": "17"},
"3662": {"name": "胶南市", "parent": "16"},
"3683": {"name": "老河口市", "parent": "18"},
"3758": {"name": "沙河市", "parent": "4"},
"3822": {"name": "宜城市", "parent": "18"},
"3842": {"name": "枣阳市", "parent": "18"},
"3850": {"name": "肇东市", "parent": "9"},
"3905": {"name": "澳门", "parent": "1"},
"3906": {"name": "澳门", "parent": "3905"},
"3907": {"name": "香港", "parent": "1"},
"3908": {"name": "香港", "parent": "3907"},
"3947": {"name": "仙桃市", "parent": "18"},
"3954": {"name": "台湾", "parent": "1"},
"3955": {"name": "台湾", "parent": "3954"},
"3956": {"name": "海外", "parent": "1"},
"3957": {"name": "海外", "parent": "3956"},
"3958": {"name": "美国", "parent": "3956"},
"3959": {"name": "加拿大", "parent": "3956"},
"3961": {"name": "日本", "parent": "3956"},
"3962": {"name": "韩国", "parent": "3956"},
"3963": {"name": "德国", "parent": "3956"},
"3964": {"name": "英国", "parent": "3956"},
"3965": {"name": "意大利", "parent": "3956"},
"3966": {"name": "西班牙", "parent": "3956"},
"3967": {"name": "法国", "parent": "3956"},
"3968": {"name": "澳大利亚", "parent": "3956"},
"3969": {"name": "东城区", "parent": "2"},
"3970": {"name": "西城区", "parent": "2"},
"3971": {"name": "崇文区", "parent": "2"},
"3972": {"name": "宣武区", "parent": "2"},
"3973": {"name": "朝阳区", "parent": "2"},
"3974": {"name": "海淀区", "parent": "2"},
"3975": {"name": "丰台区", "parent": "2"},
"3976": {"name": "石景山区", "parent": "2"},
"3977": {"name": "门头沟区", "parent": "2"},
"3978": {"name": "房山区", "parent": "2"},
"3979": {"name": "通州区", "parent": "2"},
"3980": {"name": "顺义区", "parent": "2"},
"3981": {"name": "昌平区", "parent": "2"},
"3982": {"name": "大兴区", "parent": "2"},
"3983": {"name": "平谷区", "parent": "2"},
"3984": {"name": "怀柔区", "parent": "2"},
"3985": {"name": "密云区", "parent": "2"},
"3986": {"name": "延庆区", "parent": "2"},
"3987": {"name": "黄浦区", "parent": "10"},
"3988": {"name": "徐汇区", "parent": "10"},
"3989": {"name": "长宁区", "parent": "10"},
"3990": {"name": "静安区", "parent": "10"},
"3991": {"name": "普陀区", "parent": "10"},
"3992": {"name": "闸北区", "parent": "10"},
"3993": {"name": "虹口区", "parent": "10"},
"3994": {"name": "杨浦区", "parent": "10"},
"3995": {"name": "宝山区", "parent": "10"},
"3996": {"name": "闵行区", "parent": "10"},
"3997": {"name": "嘉定区", "parent": "10"},
"3998": {"name": "浦东新区", "parent": "10"},
"3999": {"name": "松江区", "parent": "10"},
"4000": {"name": "金山区", "parent": "10"},
"4001": {"name": "青浦区", "parent": "10"},
"4002": {"name": "奉贤区", "parent": "10"},
"4003": {"name": "崇明区", "parent": "10"},
"4004": {"name": "和平区", "parent": "3"},
"4005": {"name": "河东区", "parent": "3"},
"4006": {"name": "河西区", "parent": "3"},
"4007": {"name": "南开区", "parent": "3"},
"4008": {"name": "红桥区", "parent": "3"},
"4009": {"name": "河北区", "parent": "3"},
"4010": {"name": "滨海新区", "parent": "3"},
"4011": {"name": "东丽区", "parent": "3"},
"4012": {"name": "西青区", "parent": "3"},
"4013": {"name": "北辰区", "parent": "3"},
"4014": {"name": "津南区", "parent": "3"},
"4015": {"name": "武清区", "parent": "3"},
"4016": {"name": "宝坻区", "parent": "3"},
"4017": {"name": "静海区", "parent": "3"},
"4018": {"name": "宁河区", "parent": "3"},
"4019": {"name": "蓟州区", "parent": "3"},
"4020": {"name": "渝中区", "parent": "23"},
"4021": {"name": "江北区", "parent": "23"},
"4022": {"name": "南岸区", "parent": "23"},
"4023": {"name": "沙坪坝区", "parent": "23"},
"4024": {"name": "九龙坡区", "parent": "23"},
"4025": {"name": "大渡口区", "parent": "23"},
"4026": {"name": "渝北区", "parent": "23"},
"4027": {"name": "巴南区", "parent": "23"},
"4028": {"name": "北碚区", "parent": "23"},
"4029": {"name": "万州区", "parent": "23"},
"4030": {"name": "黔江区", "parent": "23"},
"4031": {"name": "永川区", "parent": "23"},
"4032": {"name": "涪陵区", "parent": "23"},
"4033": {"name": "江津区", "parent": "23"},
"4034": {"name": "合川区", "parent": "23"},
"4035": {"name": "双桥区", "parent": "23"},
"4036": {"name": "万盛区", "parent": "23"},
"4037": {"name": "荣昌区", "parent": "23"},
"4038": {"name": "大足区", "parent": "23"},
"4039": {"name": "璧山区", "parent": "23"},
"4040": {"name": "铜梁区", "parent": "23"},
"4041": {"name": "潼南区", "parent": "23"},
"4042": {"name": "綦江区", "parent": "23"},
"4043": {"name": "忠县", "parent": "23"},
"4044": {"name": "开州区", "parent": "23"},
"4045": {"name": "云阳县", "parent": "23"},
"4046": {"name": "梁平区", "parent": "23"},
"4047": {"name": "垫江县", "parent": "23"},
"4048": {"name": "丰都县", "parent": "23"},
"4049": {"name": "奉节县", "parent": "23"},
"4050": {"name": "巫山县", "parent": "23"},
"4051": {"name": "巫溪县", "parent": "23"},
"4052": {"name": "城口县", "parent": "23"},
"4053": {"name": "武隆区", "parent": "23"},
"4054": {"name": "石柱土家族自治县", "parent": "23"},
"4055": {"name": "秀山土家族苗族自治县", "parent": "23"},
"4056": {"name": "酉阳土家族苗族自治县", "parent": "23"},
"4057": {"name": "彭水苗族土家族自治县", "parent": "23"},
"4058": {"name": "潜江市", "parent": "18"},
"4059": {"name": "三沙市", "parent": "22"},
"4060": {"name": "石河子市", "parent": "32"},
"4061": {"name": "阿拉尔市", "parent": "32"},
"4062": {"name": "图木舒克市", "parent": "32"},
"4063": {"name": "五家渠市", "parent": "32"},
"4064": {"name": "北屯市", "parent": "32"},
"4065": {"name": "铁门关市", "parent": "32"},
"4066": {"name": "儋州市", "parent": "22"},
"4067": {"name": "五指山市", "parent": "22"},
"4068": {"name": "文昌市", "parent": "22"},
"4069": {"name": "琼海市", "parent": "22"},
"4070": {"name": "万宁市", "parent": "22"},
"4072": {"name": "定安县", "parent": "22"},
"4073": {"name": "屯昌县", "parent": "22"},
"4074": {"name": "澄迈县", "parent": "22"},
"4075": {"name": "临高县", "parent": "22"},
"4076": {"name": "琼中黎族苗族自治县", "parent": "22"},
"4077": {"name": "保亭黎族苗族自治县", "parent": "22"},
"4078": {"name": "白沙黎族自治县", "parent": "22"},
"4079": {"name": "昌江黎族自治县", "parent": "22"},
"4080": {"name": "乐东黎族自治县", "parent": "22"},
"4081": {"name": "陵水黎族自治县", "parent": "22"},
"4082": {"name": "马来西亚", "parent": "3956"},
"6047": {"name": "长寿区", "parent": "23"},
"6857": {"name": "阿富汗", "parent": "3956"},
"6858": {"name": "阿尔巴尼亚", "parent": "3956"},
"6859": {"name": "阿尔及利亚", "parent": "3956"},
"6860": {"name": "美属萨摩亚", "parent": "3956"},
"6861": {"name": "安道尔", "parent": "3956"},
"6862": {"name": "安哥拉", "parent": "3956"},
"6863": {"name": "安圭拉", "parent": "3956"},
"6864": {"name": "南极洲", "parent": "3956"},
"6865": {"name": "安提瓜和巴布达", "parent": "3956"},
"6866": {"name": "阿根廷", "parent": "3956"},
"6867": {"name": "亚美尼亚", "parent": "3956"},
"6869": {"name": "奥地利", "parent": "3956"},
"6870": {"name": "阿塞拜疆", "parent": "3956"},
"6871": {"name": "巴哈马", "parent": "3956"},
"6872": {"name": "巴林", "parent": "3956"},
"6873": {"name": "孟加拉国", "parent": "3956"},
"6874": {"name": "巴巴多斯", "parent": "3956"},
"6875": {"name": "白俄罗斯", "parent": "3956"},
"6876": {"name": "比利时", "parent": "3956"},
"6877": {"name": "伯利兹", "parent": "3956"},
"6878": {"name": "贝宁", "parent": "3956"},
"6879": {"name": "百慕大", "parent": "3956"},
"6880": {"name": "不丹", "parent": "3956"},
"6881": {"name": "玻利维亚", "parent": "3956"},
"6882": {"name": "波黑", "parent": "3956"},
"6883": {"name": "博茨瓦纳", "parent": "3956"},
"6884": {"name": "布维岛", "parent": "3956"},
"6885": {"name": "巴西", "parent": "3956"},
"6886": {"name": "英属印度洋领土", "parent": "3956"},
"6887": {"name": "文莱", "parent": "3956"},
"6888": {"name": "保加利亚", "parent": "3956"},
"6889": {"name": "布基纳法索", "parent": "3956"},
"6890": {"name": "布隆迪", "parent": "3956"},
"6891": {"name": "柬埔寨", "parent": "3956"},
"6892": {"name": "喀麦隆", "parent": "3956"},
"6893": {"name": "佛得角", "parent": "3956"},
"6894": {"name": "开曼群岛", "parent": "3956"},
"6895": {"name": "中非", "parent": "3956"},
"6896": {"name": "乍得", "parent": "3956"},
"6897": {"name": "智利", "parent": "3956"},
"6898": {"name": "圣诞岛", "parent": "3956"},
"6899": {"name": "科科斯(基林)群岛", "parent": "3956"},
"6900": {"name": "哥伦比亚", "parent": "3956"},
"6901": {"name": "科摩罗", "parent": "3956"},
"6902": {"name": "刚果(布)", "parent": "3956"},
"6903": {"name": "刚果(金)", "parent": "3956"},
"6904": {"name": "库克群岛", "parent": "3956"},
"6905": {"name": "哥斯达黎加", "parent": "3956"},
"6906": {"name": "科特迪瓦", "parent": "3956"},
"6907": {"name": "克罗地亚", "parent": "3956"},
"6908": {"name": "古巴", "parent": "3956"},
"6909": {"name": "塞浦路斯", "parent": "3956"},
"6910": {"name": "捷克", "parent": "3956"},
"6911": {"name": "丹麦", "parent": "3956"},
"6912": {"name": "吉布提", "parent": "3956"},
"6913": {"name": "多米尼克", "parent": "3956"},
"6914": {"name": "多米尼加共和国", "parent": "3956"},
"6915": {"name": "东帝汶", "parent": "3956"},
"6916": {"name": "厄瓜多尔", "parent": "3956"},
"6917": {"name": "埃及", "parent": "3956"},
"6918": {"name": "萨尔瓦多", "parent": "3956"},
"6919": {"name": "赤道几内亚", "parent": "3956"},
"6920": {"name": "厄立特里亚", "parent": "3956"},
"6921": {"name": "爱沙尼亚", "parent": "3956"},
"6922": {"name": "埃塞俄比亚", "parent": "3956"},
"6923": {"name": "福克兰群岛(马尔维纳斯)", "parent": "3956"},
"6924": {"name": "法罗群岛", "parent": "3956"},
"6925": {"name": "斐济", "parent": "3956"},
"6926": {"name": "芬兰", "parent": "3956"},
"6927": {"name": "法属圭亚那", "parent": "3956"},
"6928": {"name": "法属波利尼西亚", "parent": "3956"},
"6929": {"name": "法属南部领土", "parent": "3956"},
"6930": {"name": "加蓬", "parent": "3956"},
"6931": {"name": "冈比亚", "parent": "3956"},
"6932": {"name": "格鲁吉亚", "parent": "3956"},
"6933": {"name": "加纳", "parent": "3956"},
"6934": {"name": "直布罗陀", "parent": "3956"},
"6935": {"name": "希腊", "parent": "3956"},
"6936": {"name": "格陵兰", "parent": "3956"},
"6937": {"name": "格林纳达", "parent": "3956"},
"6938": {"name": "瓜德罗普", "parent": "3956"},
"6939": {"name": "关岛", "parent": "3956"},
"6940": {"name": "危地马拉", "parent": "3956"},
"6941": {"name": "几内亚", "parent": "3956"},
"6942": {"name": "几内亚比绍", "parent": "3956"},
"6943": {"name": "圭亚那", "parent": "3956"},
"6944": {"name": "海地", "parent": "3956"},
"6945": {"name": "赫德岛和麦克唐纳岛", "parent": "3956"},
"6946": {"name": "洪都拉斯", "parent": "3956"},
"6947": {"name": "匈牙利", "parent": "3956"},
"6948": {"name": "冰岛", "parent": "3956"},
"6949": {"name": "印度", "parent": "3956"},
"6950": {"name": "印度尼西亚", "parent": "3956"},
"6951": {"name": "伊朗", "parent": "3956"},
"6952": {"name": "伊拉克", "parent": "3956"},
"6953": {"name": "爱尔兰", "parent": "3956"},
"6954": {"name": "以色列", "parent": "3956"},
"6955": {"name": "牙买加", "parent": "3956"},
"6956": {"name": "约旦", "parent": "3956"},
"6957": {"name": "哈萨克斯坦", "parent": "3956"},
"6958": {"name": "肯尼亚", "parent": "3956"},
"6959": {"name": "基里巴斯", "parent": "3956"},
"6960": {"name": "朝鲜", "parent": "3956"},
"6961": {"name": "科威特", "parent": "3956"},
"6962": {"name": "吉尔吉斯斯坦", "parent": "3956"},
"6963": {"name": "老挝", "parent": "3956"},
"6964": {"name": "拉脱维亚", "parent": "3956"},
"6965": {"name": "黎巴嫩", "parent": "3956"},
"6966": {"name": "莱索托", "parent": "3956"},
"6967": {"name": "利比里亚", "parent": "3956"},
"6968": {"name": "利比亚", "parent": "3956"},
"6969": {"name": "列支敦士登", "parent": "3956"},
"6970": {"name": "立陶宛", "parent": "3956"},
"6971": {"name": "卢森堡", "parent": "3956"},
"6972": {"name": "前南马其顿", "parent": "3956"},
"6973": {"name": "马达加斯加", "parent": "3956"},
"6974": {"name": "马拉维", "parent": "3956"},
"6975": {"name": "马尔代夫", "parent": "3956"},
"6976": {"name": "马里", "parent": "3956"},
"6977": {"name": "马耳他", "parent": "3956"},
"6978": {"name": "马绍尔群岛", "parent": "3956"},
"6979": {"name": "马提尼克", "parent": "3956"},
"6980": {"name": "毛里塔尼亚", "parent": "3956"},
"6981": {"name": "毛里求斯", "parent": "3956"},
"6982": {"name": "马约特", "parent": "3956"},
"6983": {"name": "墨西哥", "parent": "3956"},
"6984": {"name": "密克罗尼西亚联邦", "parent": "3956"},
"6985": {"name": "摩尔多瓦", "parent": "3956"},
"6986": {"name": "摩纳哥", "parent": "3956"},
"6987": {"name": "蒙古", "parent": "3956"},
"6988": {"name": "蒙特塞拉特", "parent": "3956"},
"6989": {"name": "摩洛哥", "parent": "3956"},
"6990": {"name": "莫桑比克", "parent": "3956"},
"6991": {"name": "缅甸", "parent": "3956"},
"6992": {"name": "纳米比亚", "parent": "3956"},
"6993": {"name": "瑙鲁", "parent": "3956"},
"6994": {"name": "尼泊尔", "parent": "3956"},
"6995": {"name": "荷兰", "parent": "3956"},
"6996": {"name": "荷属安的列斯", "parent": "3956"},
"6997": {"name": "新喀里多尼亚", "parent": "3956"},
"6998": {"name": "新西兰", "parent": "3956"},
"6999": {"name": "尼加拉瓜", "parent": "3956"},
"7000": {"name": "尼日尔", "parent": "3956"},
"7001": {"name": "尼日利亚", "parent": "3956"},
"7002": {"name": "纽埃", "parent": "3956"},
"7003": {"name": "诺福克岛", "parent": "3956"},
"7004": {"name": "北马里亚纳", "parent": "3956"},
"7005": {"name": "挪威", "parent": "3956"},
"7006": {"name": "阿曼", "parent": "3956"},
"7007": {"name": "巴基斯坦", "parent": "3956"},
"7008": {"name": "帕劳", "parent": "3956"},
"7009": {"name": "巴勒斯坦", "parent": "3956"},
"7010": {"name": "巴拿马", "parent": "3956"},
"7011": {"name": "巴布亚新几内亚", "parent": "3956"},
"7012": {"name": "巴拉圭", "parent": "3956"},
"7013": {"name": "秘鲁", "parent": "3956"},
"7014": {"name": "菲律宾", "parent": "3956"},
"7015": {"name": "皮特凯恩群岛", "parent": "3956"},
"7016": {"name": "波兰", "parent": "3956"},
"7017": {"name": "葡萄牙", "parent": "3956"},
"7018": {"name": "波多黎各", "parent": "3956"},
"7019": {"name": "卡塔尔", "parent": "3956"},
"7020": {"name": "留尼汪", "parent": "3956"},
"7021": {"name": "罗马尼亚", "parent": "3956"},
"7022": {"name": "俄罗斯联邦", "parent": "3956"},
"7023": {"name": "卢旺达", "parent": "3956"},
"7024": {"name": "圣赫勒拿", "parent": "3956"},
"7025": {"name": "圣基茨和尼维斯", "parent": "3956"},
"7026": {"name": "圣卢西亚", "parent": "3956"},
"7027": {"name": "圣皮埃尔和密克隆", "parent": "3956"},
"7028": {"name": "圣文森特和格林纳丁斯", "parent": "3956"},
"7029": {"name": "萨摩亚", "parent": "3956"},
"7030": {"name": "圣马力诺", "parent": "3956"},
"7031": {"name": "圣多美和普林西比", "parent": "3956"},
"7032": {"name": "沙特阿拉伯", "parent": "3956"},
"7033": {"name": "塞内加尔", "parent": "3956"},
"7034": {"name": "塞舌尔", "parent": "3956"},
"7035": {"name": "塞拉利昂", "parent": "3956"},
"7036": {"name": "新加坡", "parent": "3956"},
"7037": {"name": "斯洛伐克", "parent": "3956"},
"7038": {"name": "斯洛文尼亚", "parent": "3956"},
"7039": {"name": "所罗门群岛", "parent": "3956"},
"7040": {"name": "索马里", "parent": "3956"},
"7041": {"name": "南非", "parent": "3956"},
"7042": {"name": "南乔治亚岛和南桑德韦奇岛", "parent": "3956"},
"7043": {"name": "斯里兰卡", "parent": "3956"},
"7044": {"name": "苏丹", "parent": "3956"},
"7045": {"name": "苏里南", "parent": "3956"},
"7046": {"name": "斯瓦尔巴群岛", "parent": "3956"},
"7047": {"name": "斯威士兰", "parent": "3956"},
"7048": {"name": "瑞典", "parent": "3956"},
"7049": {"name": "瑞士", "parent": "3956"},
"7050": {"name": "叙利亚", "parent": "3956"},
"7051": {"name": "塔吉克斯坦", "parent": "3956"},
"7052": {"name": "坦桑尼亚", "parent": "3956"},
"7053": {"name": "泰国", "parent": "3956"},
"7054": {"name": "多哥", "parent": "3956"},
"7055": {"name": "托克劳", "parent": "3956"},
"7056": {"name": "汤加", "parent": "3956"},
"7057": {"name": "特立尼达和多巴哥", "parent": "3956"},
"7058": {"name": "突尼斯", "parent": "3956"},
"7059": {"name": "土耳其", "parent": "3956"},
"7060": {"name": "土库曼斯坦", "parent": "3956"},
"7061": {"name": "特克斯科斯群岛", "parent": "3956"},
"7062": {"name": "图瓦卢", "parent": "3956"},
"7063": {"name": "乌干达", "parent": "3956"},
"7064": {"name": "乌克兰", "parent": "3956"},
"7065": {"name": "阿联酋", "parent": "3956"},
"7066": {"name": "美国本土外小岛屿", "parent": "3956"},
"7067": {"name": "乌拉圭", "parent": "3956"},
"7068": {"name": "乌兹别克斯坦", "parent": "3956"},
"7069": {"name": "瓦努阿图", "parent": "3956"},
"7070": {"name": "梵蒂冈", "parent": "3956"},
"7071": {"name": "委内瑞拉", "parent": "3956"},
"7072": {"name": "越南", "parent": "3956"},
"7073": {"name": "英属维尔京群岛", "parent": "3956"},
"7074": {"name": "美属维尔京群岛", "parent": "3956"},
"7075": {"name": "瓦利斯和富图纳", "parent": "3956"},
"7076": {"name": "西撒哈拉", "parent": "3956"},
"7077": {"name": "也门", "parent": "3956"},
"7078": {"name": "南斯拉夫", "parent": "3956"},
"7079": {"name": "赞比亚", "parent": "3956"},
"7080": {"name": "津巴布韦", "parent": "3956"},
"7081": {"name": "塞尔维亚", "parent": "3956"},
"7082": {"name": "雄安新区", "parent": "4"},
"7084": {"name": "天门市", "parent": "18"},
}
NM_SET = set([v["name"] for _, v in TBL.items()])
def get_names(id):
if not id or str(id).lower() == "none":
return []
id = str(id)
if not re.match("[0-9]+$", id.strip()):
return [id]
nms = []
d = TBL.get(id)
if not d:
return []
nms.append(d["name"])
p = get_names(d["parent"])
if p:
nms.extend(p)
return nms
def isName(nm):
if nm in NM_SET:
return True
if nm + "" in NM_SET:
return True
if re.sub(r"(省|(回族|壮族|维吾尔)*自治区)$", "", nm) in NM_SET:
return True
return False
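For context, a minimal usage sketch of the two helpers above, assuming TBL, get_names and isName from this removed regions module are in scope (the top-level province entries referenced by "parent" are defined earlier in the same table):

# Hypothetical usage of the removed regions helpers (illustrative, not part of the original diff).
print(get_names("267"))       # starts with ["重庆市", ...]; ancestor names come from the parent chain above this excerpt
print(get_names("beijing"))   # non-numeric ids are returned as-is: ["beijing"]
print(isName("重庆市"))        # True: the literal name is present in NM_SET
print(isName("云南省"))        # True only if "云南" remains in NM_SET after the "省" suffix is stripped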

View File

@ -1,65 +0,0 @@
[
"科技",
"集团",
"网络科技",
"技术",
"信息",
"分公司",
"信息技术",
"发展",
"科技股份",
"网络",
"贸易",
"商贸",
"工程",
"企业",
"集团股份",
"商务",
"工业",
"控股集团",
"国际贸易",
"软件技术",
"数码科技",
"软件开发",
"有限",
"经营",
"科技开发",
"股份公司",
"电子技术",
"实业集团",
"责任",
"无限",
"工程技术",
"上市公司",
"技术开发",
"软件系统",
"总公司",
"网络服务",
"ltd.",
"technology",
"company",
"服务公司",
"计算机技术",
"计算机软件",
"电子信息",
"corporation",
"计算机服务",
"计算机系统",
"有限公司",
"事业部",
"公司",
"股份",
"有限责任",
"软件",
"控股",
"高科技",
"房地产",
"事业群",
"部门",
"电子商务",
"人力资源顾问",
"人力资源",
"株式会社",
"网络营销"
]

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,911 +0,0 @@
[
"google assistant investments",
"amazon",
"dingtalk china information",
"zhejiang alibaba communication",
"yunos",
"腾讯云",
"新浪新闻",
"网邻通",
"蚂蚁集团",
"大疆",
"恒生股份",
"sf express",
"智者天下",
"shanghai hema network",
"papayamobile",
"lexinfintech",
"industrial consumer finance",
"360搜索",
"世纪光速",
"迅雷区块链",
"赛盒科技",
"齐力电子商务",
"平安养老险",
"平安证券",
"平安好贷",
"五八新服",
"呯嘭智能",
"阿里妈妈",
"mdt",
"tencent",
"weibo",
"浪潮软件",
"阿里巴巴广告",
"mashang consumer finance",
"维沃",
"hqg , limited",
"moodys",
"搜狐支付",
"百度秀",
"新浪服务",
"零售通",
"同城艺龙",
"虾米音乐",
"贝壳集团",
"小米有品",
"滴滴自动驾驶",
"图记",
"阿里影业",
"卓联软件",
"zhejiang tmall",
"谷歌中国",
"hithink flush",
"时装科技",
"程会玩国际旅行社",
"amazon china holding limited",
"中信消金",
"当当比特物流",
"新浪新媒体咨询",
"tongcheng network",
"金山在线",
"shopping cart",
"犀互动",
"五八",
"bilibili",
"阿里星球",
"滴滴金科服务",
"美团",
"哈啰出行",
"face",
"平安健康",
"招商银行",
"连亚",
"盒马网络",
"b站",
"华为机器",
"shanghai mdt infotech",
"ping an healthkonnect",
"beijing home link real estate broker",
"花海仓",
"beijing jingdong shangke information",
"微影智能",
"酷狗游戏",
"health.pingan.com",
"众安",
"陌陌",
"海康威视数字",
"同程网",
"艾丁金融",
"知乎",
" lu",
"国际商业机器公司",
"捷信消费金融",
"恒生利融",
"china merchants bank",
"企鹅电竞",
"捷信信驰",
"360智能家居",
"小桔车服",
"homecredit",
"皮皮虾",
"畅游",
"聚爱聊",
"suning.com",
"途牛旅游网",
"花呗",
"盈店通",
"sina",
"阿里巴巴音乐",
"华为技术有限公司",
"国付宝",
"shanghai lianshang network",
"oppo",
"华为投资控股",
"beijing sohu new media information",
"times square",
"菜鸟物流",
"lingxing",
"jd digits",
"同程旅游",
"分期乐",
"火锅视频",
"天天快报",
"猎豹移动",
"五八人力资源",
"宝宝树",
"顺丰科技",
"上海西翠",
"诗程文化传播",
"dewu",
"领星网络",
"aliexpress",
"贝塔通科技",
"链家",
"花小猪",
"趣输入",
"搜狐新媒体",
"一淘",
"56",
"qq阅读",
"青桔单车",
"iflytek",
"每日优鲜电子商务",
"腾讯觅影",
"微医",
"松果网",
"paypal",
"递瑞供应链管理",
"领星",
"qunar",
"三快",
"lu.com",
"携程旅行网",
"新潮传媒",
"链家经纪",
"景域文化",
"阿里健康",
"pingpeng",
"聚划算",
"零机科技",
"街兔电单车",
"快乐购",
"华为数字能源",
"搜狐",
"陆家嘴国际金融资产交易市场",
"nanjing tuniu",
"亚马逊",
"苏宁易购",
"携程旅游",
"苏宁金服",
"babytree",
"悟空问答",
"同花顺",
"eastmoney",
"浪潮信息",
"滴滴智慧交通",
"beijing ruixun lingtong",
"平安综合金融服务",
"爱奇艺",
"小米集团",
"华为云",
"微店",
"恒生集团",
"网易有道",
"boccfc",
"世纪思速科技",
"海康消防",
"beijing xiaomi",
"众安科技",
"五八同城",
"霆程汽车租赁",
"云卖分销",
"乐信集团",
"蚂蚁",
"舶乐蜜电子商务",
"支付宝中国",
"砖块消消消",
"vivo",
"阿里互娱",
"中国平安",
"lingxihudong",
"百度网盘",
"1号店",
"字节跳动",
"京东科技",
"驴妈妈兴旅国际旅行社",
"hangzhou alibaba music",
"xunlei",
"灵犀互动娱乐",
"快手",
"youtube",
"连尚慧眼",
"腾讯体育",
"爱商在线",
"酷我音乐",
"金融壹账通",
"搜狗服务",
"banma information",
"a站",
"罗汉堂",
"薇仕网络",
"搜狐新闻",
"贝宝",
"薇仕",
"口袋时尚科技",
"穆迪咨询",
"新狐投资管理",
"hikvision",
"alimama china holding limited",
"超聚变数字",
"腾讯视频",
"恒生电子",
"百度游戏",
"绿洲",
"木瓜移动",
"红袖添香",
"店匠科技",
"易贝",
"一淘网",
"博览群书",
"唯品会",
"lazglobal",
"amap",
"芒果网",
"口碑",
"海康慧影",
"腾讯音乐娱乐",
"网易严选",
"微信",
"shenzhen lexin holding",
"hangzhou pingpeng intelligent",
"连尚网络",
"海思",
"isunor",
"蝉翼",
"阿里游戏",
"广州优视",
"优视",
"腾讯征信",
"识装",
"finserve.pingan.com",
"papaya",
"阅文",
"平安健康保险",
"考拉海购",
"网易印象",
"wifi万能钥匙",
"新浪互联服务",
"亚马逊云科技",
"迅雷看看",
"华为朗新科技",
"adyen hong kong limited",
"谷歌",
"得物",
"网心",
"cainiao network",
"沐瞳",
"linkedln",
"hundsun",
"阿里旅行",
"珍爱网",
"阿里巴巴通信",
"金山奇剑",
"tongtool",
"华为安捷信电气",
"快乐时代",
"平安寿险",
"微博",
"微跳蚤",
"oppo移动通信",
"毒",
"alimama",
"shoplazza",
"shenzhen dianjiang science and",
"众鸣世科",
"平安金融",
"狐友",
"维沃移动通信",
"tobosoft",
"齐力电商",
"ali",
"诚信通",
"行吟",
"跳舞的线",
"橙心优选",
"众安健康",
"亚马逊中国投资",
"德絮投资管理中心合伙",
"招联消费金融",
"百度文学",
"芝麻信用",
"阿里零售通",
"时装",
"花样直播",
"sogou",
"uc",
"海思半导体",
"zhongan online p&c insurance",
"新浪数字",
"驴妈妈旅游网",
"华为数字能源技术",
"京东数科",
"oracle",
"xiaomi",
"nyse",
"阳光消费金融",
"天天动听",
"大众点评",
"上海瑞家",
"trustpass",
"hundsun technologies",
"美团小贷",
"ebay",
"通途",
"tcl",
"鸿蒙",
"酷狗计算机",
"品诺保险",
"capitalg",
"康盛创想",
"58同城",
"闲鱼",
"微软",
"吉易付科技",
"理财通",
"ctrip",
"yy",
"华为数字",
"kingsoft",
"孙宁金融",
"房江湖经纪",
"youku",
"ant financial services group",
"盒马",
"sensetime",
"伊千网络",
"小豹ai翻译棒",
"shopify",
"前海微众银行",
"qd",
"gmail",
"pingpong",
"alibaba group holding limited",
"捷信时空电子商务",
"orientsec",
"乔戈里管理咨询",
"ant",
"锐讯灵通",
"兴业消费金融",
"京东叁佰陆拾度电子商务",
"新浪",
"优酷土豆",
"海康机器人",
"美团单车",
"海康存储",
"领英",
"阿里全球速卖通",
"美菜网",
"京邦达",
"安居客",
"阿里体育",
"相互宝",
"cloudwalk",
"百度智能云",
"贝壳",
"酷狗",
"sunshine consumer finance",
"掌宜",
"奇酷网",
"核新同花顺",
"阿里巴巴影业",
"节创",
"学而思网校",
"速途",
"途牛",
"阿里云计算",
"beijing sensetime",
"alibaba cloud",
"西瓜视频",
"美团优选",
"orient securities limited",
"华为朗新",
"店匠",
"shanghai weishi network",
"友盟",
"飞猪旅行",
"滴滴出行",
"alipay",
"mogu",
"dangdang",
"大麦网",
"汉军智能系统",
"百度地图",
"货车帮",
"狐狸金服",
"众安在线保险经纪",
"华为通信",
"新浪支付",
"zhihu",
"alibaba cloud computing",
"沙发视频",
"金山软件",
"ping an good doctor",
"携程",
"脉脉",
"youku information beijing",
"zhongan",
"艾丁软件",
"乒乓智能",
"蘑菇街",
"taobao",
"华为技术服务",
"仕承文化传播",
"安捷信",
"狐狸互联网小额贷款",
"节点迅捷",
"中国银行",
"搜镇",
"众安在线",
"dingtalk",
"云从科技",
"beijing jingbangda trade",
"moody s",
"滚动的天空",
"yl.pingan.com",
"奇虎",
"alihealth",
"芒果tv",
"lufax",
"美团打车",
"小桔",
"贝壳找房网",
"小米科技",
"vips",
"kindle",
"亚马逊服务",
"citic consumer finance",
"微众",
"搜狗智慧互联网医院",
"盒马鲜生",
"life.pinan.com",
"ph.com.cn",
"银联",
"cmbchina",
"平安金融科技咨询",
"微保",
"甲骨文中国",
"飞书",
"koubei shanghai information",
"企鹅辅导",
"斑马",
"平安租赁",
"云从",
"马上消费",
"hangzhou ali baba advertising",
"金山",
"赛盒",
"科大讯飞",
"金星创业投资",
"平安国际融资租赁",
"360你财富",
"西山居",
"shenzhen qianhai fourth paradigm data",
"海思光电子",
"猎户星空",
"网易公司",
"浪潮",
"粒粒橙传媒",
"招联金融",
"100. me",
"捷信信驰咨询",
"唯品仓",
"orient",
"趣拿",
"摩拜单车",
"天猫精灵",
"菜鸟",
"豹小贩",
"去哪儿",
"米家",
"哈啰单车",
"搜狐体育",
"shopify payments usa",
"高德软件",
"讯联智付",
"乐信",
"唯你搭",
"第四范式",
"菜鸟网络",
"同程",
"yy语音",
"浪潮云",
"东财",
"淘宝",
"寻梦",
"citic securities limited",
"青橙之旅",
"阿里巴巴",
"番茄小说",
"上海亿贝",
"inspur",
"babytree inc",
"海康智慧产业股权投资基金合伙合伙",
"adyen",
"艺龙",
"蚂蚁金服",
"平安金服",
"百度百科",
"unionpay",
"当当",
"阅文集团",
"东方财富",
"东方证券",
"哈罗单车",
"优酷",
"海康",
"alipay china network",
"网商银行",
"钧正",
"property.pingan.com",
"豹咖啡",
"网易",
"我爱cba",
"theduapp",
"360",
"金山数字娱乐",
"新浪阅读",
"alibabagames",
"顺丰",
"支点商贸",
"同程旅行",
"citic securities",
"ele.com",
"tal",
"fresh hema",
"运满满",
"贝壳网",
"酷狗音乐",
"鲜城",
"360健康",
"浪潮世科",
"迅雷网络",
"哔哩哔哩",
"华为电动",
"淘友天下",
"华多网络",
"xunlei networking technologies",
"云杉",
"当当网电子商务",
"津虹网络",
"wedoc cloud hangzhou holdings",
"alisports shanghai",
"旷视金智",
"钉钉中国",
"微影",
"金山快快",
"亿贝",
"wedoc",
"autonavi",
"哈啰助力车",
"google cloud",
"新浪乐居",
"京东股票",
"搜狗智慧远程医疗中心",
"中银消金",
"merchants union consumer finance",
"王者荣耀",
"百度手机",
"美团民宿",
"kaola",
"小屋",
"金山网络",
"来往",
"顺丰速运",
"腾讯课堂",
"百度在线网络",
"美团买菜",
"威视汽车",
"uc mobile",
"来赞达",
"平安健康医疗",
"豹小秘",
"尚网",
"哈勃投资",
" ping an insurance group of china ,",
"小米",
"360好药",
"qq音乐",
"lingxigames",
"faceu激萌",
"搜狗",
"sohu",
"满帮",
"vipshop",
"wishpost",
"金山世游",
"shanghai yibaimi network",
"1688",
"海康汽车",
"顺丰控股",
"华为",
"妙镜vr",
"paybkj.com",
"hellobike",
"豹来电",
"京东",
"驴妈妈",
"momo",
"平安健康险",
"哈勃科技",
"美菜",
"众安在线财产保险",
"海康威视",
"east money information",
"阿里云",
"蝉游记",
"余额宝",
"屋客",
"滴滴",
"shopify international limited",
"百度",
"阿里健康中国",
"阿里通信",
"微梦创科",
"微医云",
"轻颜相机",
"搜易居",
"趣店集团",
"美团云",
"ant group",
"金山云",
"beijing express hand",
"觅觅",
"支付宝",
"滴滴承信科技咨询服务",
"拼多多",
"众安运动",
"乞力电商",
"youcash",
"唯品金融",
"陆金所",
"本地生活",
"sz dji",
"海康智能",
"魔方网聘",
"青藤大学",
"international business machines",
"学而思",
"beijing zhongming century science and",
"猎豹清理大师",
"asinking",
"高德",
"苏宁",
"优酷网",
"艾丁",
"中银消费金融",
"京东健康",
"五八教育",
"pingpongx",
"搜狐时尚",
"阿里广告",
"平安财险",
"中邮消金",
"etao",
"怕怕",
"nyse:cmcm",
"华为培训中心",
"高德地图",
"云狐天下征信",
"大疆创新",
"连尚",
"壹佰米",
"康健公司",
"iqiyi.com",
"360安全云盘",
"馒头直播",
"淘友网",
"东方赢家",
"bank of china",
"微众银行",
"阿里巴巴国际站",
"虾米",
"去哪儿网",
"ctrip travel network shanghai",
"潇湘书院",
"腾讯",
"快乐阳光互动娱乐传媒",
"迅雷",
"weidian",
"滴滴货运",
"ping an puhui enterprise management",
"新浪仓石基金销售",
"搜狐焦点",
"alibaba pictures",
"wps",
"平安",
"lazmall",
"百度开放平台",
"兴业消金",
" 珍爱网",
"京东云",
"小红书",
"1688. com",
"如视智数",
"missfresh",
"pazl.pingan.cn",
"平安集团",
"kugou",
"懂车帝",
"斑马智行",
"浪潮集团",
"netease hangzhou network",
"pagd.net",
"探探",
"chinaliterature",
"amazon亚马逊",
"alphabet",
"当当文创手工艺品电子商务",
"五八邦",
"shenzhen zhenai network information",
"lingshoutong",
"字节",
"lvmama",
"金山办公",
"众安保险",
"时装信息",
"优视科技",
"guangzhou kugou",
"ibm",
"滴滴打车",
"beijing sogou information service",
"megvii",
"健谈哥",
"cloudwalk group",
"蜂联科技",
"冬云",
"京东尚科",
"钢琴块2",
"京东世纪",
"商汤",
"众鸣世纪",
"腾讯音乐",
"迅雷网文化",
"华为云计算技术",
"live.me",
"全球速卖通",
"快的打车",
"hello group inc",
"美丽说",
"suning",
"opengauss",
"lazada",
"tmall",
"acfun",
"当当网",
"中银",
"旷视科技",
"百度钱包",
"淘宝网",
"新浪微博",
"迅雷集团",
"中信消费金融",
"学而思教育",
"平安普惠",
"悟空跨境",
"irobotbox",
"平安产险",
"inspur group",
"世纪卓越快递服务",
"奇虎360",
"webank",
"偶藻",
"唯品支付",
"腾讯云计算",
"众安服务",
"亿之唐",
"beijing 58 information ttechnology",
"平安好医生",
"迅雷之锤",
"旅行小账本",
"芒果游戏",
"新浪传媒",
"旷镜博煊",
"全民k歌",
"滴滴支付",
"北京网心科技",
"挂号网",
"萤石",
"chinavision media group limited",
"猎豹安全大师",
"cmcm",
"趣店",
"蚂蚁财富",
"商汤科技",
"甲骨文",
"百度云",
"百度apollo",
"19 pay",
"stock.pingan.com",
"tiktok",
"alibaba pictures group limited",
"ele",
"考拉",
"天猫",
"腾讯优图",
"起点中文网",
"百度视频",
"shanghai bili bili",
"京东物流",
"ebay marketplaces gmbh",
"alibaba sport",
"wish",
"阿里巴巴中国",
"中国银联",
"alibaba china network",
"china ping an property insurance",
"百度糯米网",
"微软中国",
"一九付",
"4 paradigm",
"叮咚买菜",
"umeng",
"众鸣科技",
"平安财富通",
"google",
"巨量引擎",
"百度贴吧",
"beijing jingdong century information",
"讯飞",
"beijing yunshan information",
"满运软件",
"中邮消费金融",
"饿了么",
"alios",
"腾讯ai实验室",
"第四范式智能",
"瀚星创业投资",
"gradient ventures",
"microsoft",
"哈啰共享汽车",
"乞力电子商务",
"mscf",
"网易影业文化",
"铁友旅游咨询",
"kilimall",
"云企互联投资",
"ping an financial consulting",
"beijng jingdong century commerce",
"高德威智能交通系统",
"中友信息",
"平安医疗健康管理",
"eciticcfc",
"中信证券",
"fliggy",
"电子湾",
"旷云金智",
"微粒贷",
"rsi",
"滴滴云计算",
"google ventures",
"箐程",
"每日优鲜",
"音兔",
"拉扎斯",
"今日头条",
"乐信控股",
"猎豹浏览器",
"细微咨询",
"好未来",
"我乐",
"绘声绘色",
"抖音",
"搜狐新时代",
"飞猪",
"鹅厂",
"贝壳找房",
"tuniu",
"红马传媒文化",
"钉钉",
"马上消费金融",
"360手机",
"平安医保",
"快途",
"alibaba",
"小哈换电",
"大麦",
"恒睿人工智能研究院",
"谷歌资本",
"猎豹",
"穆迪信息"
]

View File

@ -1,595 +0,0 @@
[
"中国科技大学",
"国防科学技术大学",
"清华大学",
"清华",
"tsinghua university",
"thu",
"北京大学",
"北大",
"beijing university",
"pku",
"中国科学技术大学",
"中国科大",
"中科大",
"china science & technology university",
"ustc",
"复旦大学",
"复旦",
"fudan university",
"fdu",
"中国人民大学",
"人大",
"人民大学",
"renmin university of china",
"ruc",
"上海交通大学",
"上海交大",
"shanghai jiao tong university",
"sjtu",
"南京大学",
"南大",
"nanjing university",
"nju",
"同济大学",
"同济",
"tongji university",
"tongji",
"浙江大学",
"浙大",
"zhejiang university",
"zju",
"南开大学",
"南开",
"nankai university",
"nku",
"北京航空航天大学",
"北航",
"beihang university",
"buaa",
"北京师范大学",
"北师",
"北师大",
"beijing normal university",
"bnu",
"武汉大学",
"武大",
"wuhan university",
"whu",
"西安交通大学",
"西安交大",
"xian jiaotong university",
"xjtu",
"天津大学",
"天大",
"university of tianjin",
"tju",
"华中科技大学",
"华中大",
"central china university science and technology",
"hust",
"北京理工大学",
"北理",
"beijing institute of technology",
"bit",
"东南大学",
"东大",
"southeast china university",
"seu",
"中山大学",
"中大",
"zhongshan university",
"sysu",
"华东师范大学",
"华师大",
"east china normal university",
"ecnu",
"哈尔滨工业大学",
"哈工大",
"harbin institute of technology",
"hit",
"厦门大学",
"厦大",
"xiamen university",
"xmu",
"西北工业大学",
"西工大",
"西北工大",
"northwestern polytechnical university",
"npu",
"中南大学",
"中南",
"middle and southern university",
"csu",
"大连理工大学",
"大工",
"institute of technology of dalian",
"dut",
"四川大学",
"川大",
"sichuan university",
"scu",
"电子科技大学",
"电子科大",
"university of electronic science and technology of china",
"uestc",
"华南理工大学",
"华南理工",
"institutes of technology of south china",
"scut",
"吉林大学",
"吉大",
"jilin university",
"jlu",
"湖南大学",
"湖大",
"hunan university",
"hnu",
"重庆大学",
"重大",
"university of chongqing",
"cqu",
"山东大学",
"山大",
"shandong university",
"sdu",
"中国农业大学",
"中国农大",
"china agricultural university",
"cau",
"中国海洋大学",
"中国海大",
"chinese marine university",
"ouc",
"中央民族大学",
"中央民大",
"central university for nationalities",
"muc",
"东北大学",
"东北工学院",
"northeastern university",
"neu 或 nu",
"兰州大学",
"兰大",
"lanzhou university",
"lzu",
"西北农林科技大学",
"西农","西北农大",
"northwest a&f university",
"nwafu",
"中国人民解放军国防科技大学",
"国防科技大学","国防科大",
"national university of defense technology",
"nudt",
"郑州大学",
"郑大",
"zhengzhou university",
"zzu",
"云南大学",
"云大",
"yunnan university",
"ynu",
"新疆大学",
"新大",
"xinjiang university",
"xju",
"北京交通大学",
"北京交大",
"beijing jiaotong university",
"bjtu",
"北京工业大学",
"北工大",
"beijing university of technology",
"bjut",
"北京科技大学",
"北科大","北京科大",
"university of science and technology beijing",
"ustb",
"北京化工大学",
"北化",
"beijing university of chemical technology",
"buct",
"北京邮电大学",
"北邮",
"beijing university of posts and telecommunications",
"beijing university of post and telecommunications",
"beijing university of post and telecommunication",
"beijing university of posts and telecommunication",
"bupt",
"北京林业大学",
"北林",
"beijing forestry university",
"bfu",
"北京协和医学院",
"协和医学院",
"peking union medical college",
"pumc",
"北京中医药大学",
"北中医",
"beijing university of chinese medicine",
"bucm",
"首都师范大学",
"首师大",
"capital normal university",
"cnu",
"北京外国语大学",
"北外",
"beijing foreign studies university",
"bfsu",
"中国传媒大学",
"中媒",
"中传",
"北京广播学院",
"communication university of china",
"cuc",
"中央财经大学",
"中央财大",
"中财大",
"the central university of finance and economics",
"cufe",
"对外经济贸易大学",
"对外经贸大学",
"贸大",
"university of international business and economics",
"uibe",
"外交学院",
"外院",
"china foreign affairs university",
"cfau",
"中国人民公安大学",
"公安大学",
"people's public security university of china",
"ppsuc",
"北京体育大学",
"北体大",
"beijing sport university",
"bsu",
"中央音乐学院",
"央音",
"中央院",
"central conservatory of music",
"ccom",
"中国音乐学院",
"国音",
"中国院",
"china conservatory of music",
"ccmusic",
"中央美术学院",
"央美",
"central academy of fine art",
"cafa",
"中央戏剧学院",
"中戏",
"the central academy of drama",
"tcad",
"中国政法大学",
"法大",
"china university of political science and law",
"zuc",
"cupl",
"中国科学院大学",
"国科大",
"科院大",
"university of chinese academy of sciences",
"ucas",
"福州大学",
"福大",
"university of fuzhou",
"fzu",
"暨南大学",
"暨大",
"ji'nan university",
"jnu",
"广州中医药大学",
"广中医",
"traditional chinese medicine university of guangzhou",
"gucm",
"华南师范大学",
"华南师大",
"south china normal university",
"scnu",
"广西大学",
"西大",
"guangxi university",
"gxu",
"贵州大学",
"贵大",
"guizhou university",
"gzu",
"海南大学",
"海大",
"university of hainan",
"hainu",
"河南大学",
"河大",
"he'nan university",
"henu",
"哈尔滨工程大学",
"哈工程",
"harbin engineering university",
"heu",
"东北农业大学",
"东北农大",
"northeast agricultural university",
"neau",
"东北林业大学",
"东北林大",
"northeast forestry university",
"nefu",
"中国地质大学",
"地大",
"china university of geosciences",
"cug",
"武汉理工大学",
"武汉理工",
"wuhan university of technology",
"wut",
"华中农业大学",
"华中农大",
"华农",
"central china agricultural university",
"hzau",
"华中师范大学",
"华中师大",
"华大",
"central china normal university",
"ccnu",
"中南财经政法大学",
"中南大",
"zhongnan university of economics & law",
"zuel",
"湖南师范大学",
"湖南师大",
"hunan normal university",
"hunnu",
"延边大学",
"延大",
"yanbian university",
"ybu",
"东北师范大学",
"东北师大",
"northeast normal university",
"nenu",
"苏州大学",
"苏大",
"soochow university",
"suda",
"南京航空航天大学",
"南航",
"nanjing aero-space university",
"nuaa",
"南京理工大学",
"南理工",
"institutes of technology of nanjing",
"njust",
"中国矿业大学",
"中国矿大",
"china mining university",
"cumt",
"南京邮电大学",
"南邮",
"nanjing university of posts and telecommunications",
"njupt",
"河海大学",
"河海",
"river sea university",
"hhu",
"江南大学",
"江南大",
"jiangnan university",
"jiangnan",
"南京林业大学",
"南林",
"nanjing forestry university",
"njfu",
"南京信息工程大学",
"南信大",
"nanjing university of information science and technology",
"nuist",
"南京农业大学",
"南农",
"南农大",
"南京农大",
"agricultural university of nanjing",
"njau",
"nau",
"南京中医药大学",
"南中医",
"nanjing university of chinese medicine",
"njucm",
"中国药科大学",
"中国药大",
"china medicine university",
"cpu",
"南京师范大学",
"南京师大",
"南师大",
"南师",
"nanjing normal university",
"nnu",
"南昌大学",
"昌大",
"university of nanchang","nanchang university",
"ncu",
"辽宁大学",
"辽大",
"liaoning university",
"lnu",
"大连海事大学",
"大连海大",
"海大",
"maritime affairs university of dalian",
"dmu",
"内蒙古大学",
"内大",
"university of the inner mongol","inner mongolia university",
"imu",
"宁夏大学",
"宁大",
"ningxia university",
"nxu",
"青海大学",
"清大",
"qinghai university",
"qhu",
"中国石油大学",
"中石大",
"china university of petroleum beijing",
"upc",
"太原理工大学",
"太原理工",
"institutes of technology of taiyuan","taiyuan university of technology",
"tyut",
"西北大学",
"西大",
"northwest university",
"nwu",
"西安电子科技大学",
"西电",
"xidian university",
"xdu",
"长安大学",
"长大",
"chang`an university",
"chu",
"陕西师范大学",
"陕西师大",
"陕师大",
"shaanxi normal university",
"snnu",
"第四军医大学",
"空军军医大学","四医大",
"air force medical university",
"fmmu",
"华东理工大学",
"华理",
"east china university of science",
"ecust",
"东华大学",
"东华",
"donghua university",
"dhu",
"上海海洋大学",
"上海海大",
"shanghai ocean university",
"shou",
"上海中医药大学",
"上中医",
"shanghai university of traditional chinese medicine",
"shutcm",
"上海外国语大学",
"上外",
"shanghai international studies university",
"sisu",
"上海财经大学",
"上海财大",
"上财",
"shanghai university of finance",
"sufe",
"上海体育学院",
"shanghai university of sport",
"上海音乐学院",
"上音",
"shanghai conservatory of music",
"shcm",
"上海大学",
"上大",
"shanghai university",
"第二军医大学",
"海军军医大学",
"naval medical university",
"西南交通大学",
"西南交大",
"southwest jiaotong university",
"swjtu",
"西南石油大学",
"西南石大",
"southwest petroleum university",
"swpu",
"成都理工大学",
"成都理工",
"chengdu university of technology",
"cdut ",
"四川农业大学",
"川农",
"川农大",
"sichuan agricultural university",
"sicau",
"成都中医药大学",
"成中医",
"chengdu university of tcm",
"cdutcm",
"西南财经大学",
"西南财大",
"西财",
"southwestern university of finance and economics",
"swufe",
"天津工业大学",
"天工大",
"tianjin university of technology",
"tgu",
"天津医科大学",
"天津医大",
"medical university of tianjin",
"tmu",
"天津中医药大学",
"天中",
"tianjin university of traditional chinese medicine",
"tutcm",
"华北电力大学",
"华电",
"north china electric power university",
"ncepu",
"河北工业大学",
"河工大",
"hebei university of technology",
"hebut",
"西藏大学",
"藏大",
"tibet university",
"tu",
"石河子大学",
"石大",
"shihezi university",
"中国美术学院",
"中国美院",
"国美",
"china academy of art",
"caa",
"宁波大学",
"宁大",
"ningbo university",
"nbu",
"西南大学",
"西大",
"southwest university",
"swu",
"安徽大学",
"安大",
"university of anhui",
"ahu",
"合肥工业大学",
"合肥工大",
"合工大",
"hefei university of technology",
"hfut",
"中国地质大学",
"地大",
"china university of geosciences",
"cug",
"中国地质大学",
"地大",
"北京地大",
"cugb",
"中国矿业大学",
"中国矿大",
"china university of mining & technology",
"cumtb",
"中国石油大学",
"中石大",
"石大",
"china university of petroleum",
"cup",
"中国石油大学",
"中石大",
"cup"]

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,91 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import json
import re
import copy
import pandas as pd
current_file_path = os.path.dirname(os.path.abspath(__file__))
TBL = pd.read_csv(
os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0
).fillna("")
TBL["name_en"] = TBL["name_en"].map(lambda x: x.lower().strip())
GOOD_SCH = json.load(open(os.path.join(current_file_path, "res/good_sch.json"), "r",encoding="utf-8"))
GOOD_SCH = set([re.sub(r"[,. &()]+", "", c) for c in GOOD_SCH])
def loadRank(fnm):
global TBL
TBL["rank"] = 1000000
with open(fnm, "r", encoding="utf-8") as f:
while True:
line = f.readline()
if not line:
break
line = line.strip("\n").split(",")
try:
nm, rk = line[0].strip(), int(line[1])
# assert len(TBL[((TBL.name_cn == nm) | (TBL.name_en == nm))]),f"<{nm}>"
TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk
except Exception:
pass
loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
def split(txt):
tks = []
for t in re.sub(r"[ \t]+", " ", txt).split():
if (
tks
and re.match(r".*[a-zA-Z]$", tks[-1])
and re.match(r"[a-zA-Z]", t)
and tks
):
tks[-1] = tks[-1] + " " + t
else:
tks.append(t)
return tks
def select(nm):
global TBL
if not nm:
return
if isinstance(nm, list):
nm = str(nm[0])
nm = split(nm)[0]
nm = str(nm).lower().strip()
nm = re.sub(r"[(][^()]+[)]", "", nm.lower())
nm = re.sub(r"(^the |[,.&();;·]+|^(英国|美国|瑞士))", "", nm)
nm = re.sub(r"大学.*学院", "大学", nm)
tbl = copy.deepcopy(TBL)
tbl["hit_alias"] = tbl["alias"].map(lambda x: nm in set(x.split("+")))
res = tbl[((tbl.name_cn == nm) | (tbl.name_en == nm) | tbl.hit_alias)]
if res.empty:
return
return json.loads(res.to_json(orient="records"))[0]
def is_good(nm):
global GOOD_SCH
nm = re.sub(r"[(][^()]+[)]", "", nm.lower())
nm = re.sub(r"[''`‘’“”,. &();]+", "", nm)
return nm in GOOD_SCH
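A minimal, hypothetical usage sketch of the lookup helpers above, assuming this removed schools module and its res/schools.csv, res/school.rank.csv and res/good_sch.json resources are loadable:

# Illustrative calls against the removed schools module (not part of the original diff).
hit = select("清华大学(北京)")          # parenthesised parts and leading "the/英国/美国/瑞士" are stripped before
if hit:                                 # matching name_cn, name_en or the "+"-separated alias column
    print(hit.get("rank"))              # rank is filled from res/school.rank.csv; unranked rows keep the default 1000000
print(is_good("Tsinghua University"))   # True only if the normalised name appears in res/good_sch.json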

View File

@ -1,189 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
from deepdoc.parser.resume.entities import degrees, regions, industries
FIELDS = [
"address STRING",
"annual_salary int",
"annual_salary_from int",
"annual_salary_to int",
"birth STRING",
"card STRING",
"certificate_obj string",
"city STRING",
"corporation_id int",
"corporation_name STRING",
"corporation_type STRING",
"degree STRING",
"discipline_name STRING",
"education_obj string",
"email STRING",
"expect_annual_salary int",
"expect_city_names string",
"expect_industry_name STRING",
"expect_position_name STRING",
"expect_salary_from int",
"expect_salary_to int",
"expect_type STRING",
"gender STRING",
"industry_name STRING",
"industry_names STRING",
"is_deleted STRING",
"is_fertility STRING",
"is_house STRING",
"is_management_experience STRING",
"is_marital STRING",
"is_oversea STRING",
"language_obj string",
"name STRING",
"nation STRING",
"phone STRING",
"political_status STRING",
"position_name STRING",
"project_obj string",
"responsibilities string",
"salary_month int",
"scale STRING",
"school_name STRING",
"self_remark string",
"skill_obj string",
"title_name STRING",
"tob_resume_id STRING",
"updated_at Timestamp",
"wechat STRING",
"work_obj string",
"work_experience int",
"work_start_time BIGINT"
]
def refactor(df):
def deal_obj(obj, k, kk):
if not isinstance(obj, type({})):
return ""
obj = obj.get(k, {})
if not isinstance(obj, type({})):
return ""
return obj.get(kk, "")
def loadjson(line):
try:
return json.loads(line)
except Exception:
pass
return {}
df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
df.fillna("", inplace=True)
clms = ["tob_resume_id", "updated_at"]
def extract(nms, cc=None):
nonlocal clms
clms.extend(nms)
for c in nms:
if cc:
df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
else:
df[c] = df["obj"].map(
lambda x: json.dumps(
x.get(
c,
{}),
ensure_ascii=False) if isinstance(
x,
type(
{})) and (
isinstance(
x.get(c),
type(
{})) or not x.get(c)) else str(x).replace(
"None",
""))
extract(["education", "work", "certificate", "project", "language",
"skill"])
extract(["wechat", "phone", "is_deleted",
"name", "tel", "email"], "contact")
extract(["nation", "expect_industry_name", "salary_month",
"industry_ids", "is_house", "birth", "annual_salary_from",
"annual_salary_to", "card",
"expect_salary_to", "expect_salary_from",
"expect_position_name", "gender", "city",
"is_fertility", "expect_city_names",
"political_status", "title_name", "expect_annual_salary",
"industry_name", "address", "position_name", "school_name",
"corporation_id",
"is_oversea", "responsibilities",
"work_start_time", "degree", "management_experience",
"expect_type", "corporation_type", "scale", "corporation_name",
"self_remark", "annual_salary", "work_experience",
"discipline_name", "marital", "updated_at"], "basic")
df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in
str(x).split(",")]))
clms.append("industry_names")
def arr2str(a):
if not a:
return ""
if isinstance(a, list):
a = " ".join([str(i) for i in a])
return str(a).replace(",", " ")
df["expect_industry_name"] = df["expect_industry_name"].map(
lambda x: arr2str(x))
df["gender"] = df["gender"].map(
lambda x: "" if x == 'M' else (
"" if x == 'F' else ""))
for c in ["is_fertility", "is_oversea", "is_house",
"management_experience", "marital"]:
df[c] = df[c].map(
lambda x: '是' if x == 'Y' else (
'否' if x == 'N' else ""))
df["is_management_experience"] = df["management_experience"]
df["is_marital"] = df["marital"]
clms.extend(["is_management_experience", "is_marital"])
df.fillna("", inplace=True)
for i in range(len(df)):
if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
df.loc[i, "phone"] = df.loc[i, "tel"].strip()
for n in ["industry_ids", "management_experience", "marital", "tel"]:
for i in range(len(clms)):
if clms[i] == n:
del clms[i]
break
clms = list(set(clms))
df = df.reindex(sorted(clms), axis=1)
#print(json.dumps(list(df.columns.values)), "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL")
for c in clms:
df[c] = df[c].map(
lambda s: str(s).replace(
"\t",
" ").replace(
"\n",
"\\n").replace(
"\r",
"\\n"))
# print(df.values.tolist())
return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0]))
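The core of refactor() is flattening the nested resume_content JSON into flat columns; below is a simplified, self-contained sketch of that pattern with toy data (no pandas or entity modules required):

# Simplified illustration of the loadjson / deal_obj flattening used by refactor() above.
import json

row = {"resume_content": '{"basic": {"name": "张三", "degree": "5"}, "contact": {"phone": "13800000000"}}'}

def loadjson(line):
    try:
        return json.loads(line)
    except Exception:
        return {}                        # malformed rows degrade to an empty dict instead of raising

obj = loadjson(row["resume_content"])
# deal_obj-style access: pull one leaf out of a named section, defaulting to "" when anything is missing.
name = obj.get("basic", {}).get("name", "")
phone = obj.get("contact", {}).get("phone", "")
print(name, phone)                       # 张三 13800000000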

View File

@ -1,696 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
import copy
import time
import datetime
import demjson3
import traceback
import signal
import numpy as np
from deepdoc.parser.resume.entities import degrees, schools, corporations
from rag.nlp import rag_tokenizer, surname
from xpinyin import Pinyin
from contextlib import contextmanager
class TimeoutException(Exception):
pass
@contextmanager
def time_limit(seconds):
def signal_handler(signum, frame):
raise TimeoutException("Timed out!")
signal.signal(signal.SIGALRM, signal_handler)
signal.alarm(seconds)
try:
yield
finally:
signal.alarm(0)
ENV = None
PY = Pinyin()
def rmHtmlTag(line):
return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, 100000, re.IGNORECASE)
def highest_degree(dg):
if not dg:
return ""
if isinstance(dg, str):
dg = [dg]
m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
return sorted([(d, m.get(d, -1)) for d in dg], key=lambda x: x[1] * -1)[0][0]
def forEdu(cv):
if not cv.get("education_obj"):
cv["integerity_flt"] *= 0.8
return cv
first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
edu_nst = []
edu_end_dt = ""
cv["school_rank_int"] = 1000000
for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
e = {}
if n.get("end_time"):
if n["end_time"] > edu_end_dt:
edu_end_dt = n["end_time"]
try:
dt = n["end_time"]
if re.match(r"[0-9]{9,}", dt):
dt = turnTm2Dt(dt)
y, m, d = getYMD(dt)
ed_dt.append(str(y))
e["end_dt_kwd"] = str(y)
except Exception as e:
pass
if n.get("start_time"):
try:
dt = n["start_time"]
if re.match(r"[0-9]{9,}", dt):
dt = turnTm2Dt(dt)
y, m, d = getYMD(dt)
st_dt.append(str(y))
e["start_dt_kwd"] = str(y)
except Exception:
pass
r = schools.select(n.get("school_name", ""))
if r:
if str(r.get("type", "")) == "1":
fea.append("211")
if str(r.get("type", "")) == "2":
fea.append("211")
if str(r.get("is_abroad", "")) == "1":
fea.append("留学")
if str(r.get("is_double_first", "")) == "1":
fea.append("双一流")
if str(r.get("is_985", "")) == "1":
fea.append("985")
if str(r.get("is_world_known", "")) == "1":
fea.append("海外知名")
if r.get("rank") and cv["school_rank_int"] > r["rank"]:
cv["school_rank_int"] = r["rank"]
if n.get("school_name") and isinstance(n["school_name"], str):
sch.append(re.sub(r"(211|985|重点大学|[,&;-])", "", n["school_name"]))
e["sch_nm_kwd"] = sch[-1]
fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()[-1])
if n.get("discipline_name") and isinstance(n["discipline_name"], str):
maj.append(n["discipline_name"])
e["major_kwd"] = n["discipline_name"]
if not n.get("degree") and "985" in fea and not first_fea:
n["degree"] = "1"
if n.get("degree"):
d = degrees.get_name(n["degree"])
if d:
e["degree_kwd"] = d
if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)", n.get("school_name",""))):
d = "专升本"
if d:
deg.append(d)
# for first degree
if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
fdeg = [d]
if n.get("school_name"):
fsch = [n["school_name"]]
if n.get("discipline_name"):
fmaj = [n["discipline_name"]]
first_fea = copy.deepcopy(fea)
edu_nst.append(e)
cv["sch_rank_kwd"] = []
if cv["school_rank_int"] <= 20 \
or ("海外名校" in fea and cv["school_rank_int"] <= 200):
cv["sch_rank_kwd"].append("顶尖学校")
elif cv["school_rank_int"] <= 50 and cv["school_rank_int"] > 20 \
or ("海外名校" in fea and cv["school_rank_int"] <= 500 and \
cv["school_rank_int"] > 200):
cv["sch_rank_kwd"].append("精英学校")
elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) \
or ("海外名校" in fea and cv["school_rank_int"] > 500):
cv["sch_rank_kwd"].append("优质学校")
else:
cv["sch_rank_kwd"].append("一般学校")
if edu_nst:
cv["edu_nst"] = edu_nst
if fea:
cv["edu_fea_kwd"] = list(set(fea))
if first_fea:
cv["edu_first_fea_kwd"] = list(set(first_fea))
if maj:
cv["major_kwd"] = maj
if fsch:
cv["first_school_name_kwd"] = fsch
if fdeg:
cv["first_degree_kwd"] = fdeg
if fmaj:
cv["first_major_kwd"] = fmaj
if st_dt:
cv["edu_start_kwd"] = st_dt
if ed_dt:
cv["edu_end_kwd"] = ed_dt
if ed_dt:
cv["edu_end_int"] = max([int(t) for t in ed_dt])
if deg:
if "本科" in deg and "专科" in deg:
deg.append("专升本")
deg = [d for d in deg if d != '本科']
cv["degree_kwd"] = deg
cv["highest_degree_kwd"] = highest_degree(deg)
if edu_end_dt:
try:
if re.match(r"[0-9]{9,}", edu_end_dt):
edu_end_dt = turnTm2Dt(edu_end_dt)
if edu_end_dt.strip("\n") == "至今":
edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
y, m, d = getYMD(edu_end_dt)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
logging.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
if sch:
cv["school_name_kwd"] = sch
if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
or not cv.get("degree_kwd"):
for c in sch:
if schools.is_good(c):
if "tag_kwd" not in cv:
cv["tag_kwd"] = []
cv["tag_kwd"].append("好学校")
cv["tag_kwd"].append("好学历")
break
if (len(cv.get("degree_kwd", [])) >= 1 and \
"本科" in cv["degree_kwd"] and \
any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
if "tag_kwd" not in cv:
cv["tag_kwd"] = []
if "好学历" not in cv["tag_kwd"]:
cv["tag_kwd"].append("好学历")
if cv.get("major_kwd"):
cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
if cv.get("school_name_kwd"):
cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
if cv.get("first_school_name_kwd"):
cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
if cv.get("first_major_kwd"):
cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
return cv
def forProj(cv):
if not cv.get("project_obj"):
return cv
pro_nms, desc = [], []
for i, n in enumerate(
sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if isinstance(x, dict) else "",
reverse=True)):
if n.get("name"):
pro_nms.append(n["name"])
if n.get("describe"):
desc.append(str(n["describe"]))
if n.get("responsibilities"):
desc.append(str(n["responsibilities"]))
if n.get("achivement"):
desc.append(str(n["achivement"]))
if pro_nms:
# cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
if desc:
cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))
return cv
def json_loads(line):
return demjson3.decode(re.sub(r": *(True|False)", r": '\1'", line))
def forWork(cv):
if not cv.get("work_obj"):
cv["integerity_flt"] *= 0.7
return cv
flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
"industry_name", "subordinates_count"]
duas = []
scales = []
fea = {c: [] for c in flds}
latest_job_tm = ""
goodcorp = False
goodcorp_ = False
work_st_tm = ""
corp_tags = []
for i, n in enumerate(
sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if isinstance(x, dict) else "",
reverse=True)):
if isinstance(n, str):
try:
n = json_loads(n)
except Exception:
continue
if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm):
work_st_tm = n["start_time"]
for c in flds:
if not n.get(c) or str(n[c]) == '0':
fea[c].append("")
continue
if c == "corporation_name":
n[c] = corporations.corpNorm(n[c], False)
if corporations.is_good(n[c]):
if i == 0:
goodcorp = True
else:
goodcorp_ = True
ct = corporations.corp_tag(n[c])
if i == 0:
corp_tags.extend(ct)
elif ct and ct[0] != "软外":
corp_tags.extend([f"{t}(曾)" for t in ct])
fea[c].append(rmHtmlTag(str(n[c]).lower()))
y, m, d = getYMD(n.get("start_time"))
if not y or not m:
continue
st = "%s-%02d-%02d" % (y, int(m), int(d))
latest_job_tm = st
y, m, d = getYMD(n.get("end_time"))
if (not y or not m) and i > 0:
continue
if not y or not m or int(y) > 2022:
y, m, d = getYMD(str(n.get("updated_at", "")))
if not y or not m:
continue
ed = "%s-%02d-%02d" % (y, int(m), int(d))
try:
duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
except Exception:
logging.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))
if n.get("scale"):
r = re.search(r"^([0-9]+)", str(n["scale"]))
if r:
scales.append(int(r.group(1)))
if goodcorp:
if "tag_kwd" not in cv:
cv["tag_kwd"] = []
cv["tag_kwd"].append("好公司")
if goodcorp_:
if "tag_kwd" not in cv:
cv["tag_kwd"] = []
cv["tag_kwd"].append("好公司(曾)")
if corp_tags:
if "tag_kwd" not in cv:
cv["tag_kwd"] = []
cv["tag_kwd"].extend(corp_tags)
cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]
if latest_job_tm:
cv["latest_job_dt"] = latest_job_tm
if fea["corporation_id"]:
cv["corporation_id"] = fea["corporation_id"]
if fea["position_name"]:
cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))
if fea["industry_name"]:
cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))
if fea["corporation_name"]:
cv["corporation_name_kwd"] = fea["corporation_name"][0]
cv["corp_nm_kwd"] = fea["corporation_name"]
cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))
if fea["responsibilities"]:
cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
if fea["subordinates_count"]:
fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
re.match(r"[^0-9]+$", str(i))]
if fea["subordinates_count"]:
cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])
if isinstance(cv.get("corporation_id"), int):
cv["corporation_id"] = [str(cv["corporation_id"])]
if not cv.get("corporation_id"):
cv["corporation_id"] = []
for i in cv.get("corporation_id", []):
cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)
if work_st_tm:
try:
if re.match(r"[0-9]{9,}", work_st_tm):
work_st_tm = turnTm2Dt(work_st_tm)
y, m, d = getYMD(work_st_tm)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
logging.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))
cv["job_num_int"] = 0
if duas:
cv["dua_flt"] = np.mean(duas)
cv["cur_dua_int"] = duas[0]
cv["job_num_int"] = len(duas)
if scales:
cv["scale_flt"] = np.max(scales)
return cv
def turnTm2Dt(b):
if not b:
return
b = str(b).strip()
if re.match(r"[0-9]{10,}", b):
b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
return b
def getYMD(b):
y, m, d = "", "", "01"
if not b:
return (y, m, d)
b = turnTm2Dt(b)
if re.match(r"[0-9]{4}", b):
y = int(b[:4])
r = re.search(r"[0-9]{4}.?([0-9]{1,2})", b)
if r:
m = r.group(1)
r = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b)
if r:
d = r.group(1)
if not d or int(d) == 0 or int(d) > 31:
d = "1"
if not m or int(m) > 12 or int(m) < 1:
m = "1"
return (y, m, d)
def birth(cv):
if not cv.get("birth"):
cv["integerity_flt"] *= 0.9
return cv
y, m, d = getYMD(cv["birth"])
if not m or not y:
return cv
b = "%s-%02d-%02d" % (y, int(m), int(d))
cv["birth_dt"] = b
cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d))
cv["age_int"] = datetime.datetime.now().year - int(y)
return cv
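A small sanity-check sketch for the date helpers above, assuming turnTm2Dt, getYMD and birth are in scope (illustrative; age_int depends on the current date):

# Hypothetical usage of the removed date helpers (not part of the original diff).
print(getYMD("2018-09-30"))              # expected (2018, "09", "30")
print(getYMD("2020"))                    # month/day fall back when absent: (2020, "1", "01")
cv = birth({"birth": "1990-05-01", "integerity_flt": 1.0})
print(cv["birth_dt"], cv["birthday_kwd"])  # expected "1990-05-01" and "0501"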
def parse(cv):
for k in cv.keys():
if cv[k] == '\\N':
cv[k] = ''
# cv = cv.asDict()
tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
"expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
"position_name", "school_name", "self_remark", "title_name"]
small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
"expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
"industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
"expect_salary_to", "salary_month"]
is_fld = [
("is_fertility", "已育", "未育"),
("is_house", "有房", "没房"),
("is_management_experience", "有管理经验", "无管理经验"),
("is_marital", "已婚", "未婚"),
("is_oversea", "有海外经验", "无海外经验")
]
rmkeys = []
for k in cv.keys():
if cv[k] is None:
rmkeys.append(k)
if (isinstance(cv[k], list) or isinstance(cv[k], str)) and len(cv[k]) == 0:
rmkeys.append(k)
for k in rmkeys:
del cv[k]
integerity = 0.
flds_num = 0.
def hasValues(flds):
nonlocal integerity, flds_num
flds_num += len(flds)
for f in flds:
v = str(cv.get(f, ""))
if len(v) > 0 and v != '0' and v != '[]':
integerity += 1
hasValues(tks_fld)
hasValues(small_tks_fld)
hasValues(kwd_fld)
hasValues(num_fld)
cv["integerity_flt"] = integerity / flds_num
if cv.get("corporation_type"):
for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
(r"[/.· <\(]+.*", ""),
(r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
(r".*(机关|事业).*", "机关"),
(r".*(非盈利|Non-profit).*", "非盈利"),
(r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
(r".*国有.*", "国企"),
(r"[ \(\)人/·0-9-]+", ""),
(r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
if len(cv["corporation_type"]) < 2:
del cv["corporation_type"]
if cv.get("political_status"):
for p, r in [
(r".*党员.*", "党员"),
(r".*(无党派|公民).*", "群众"),
(r".*团员.*", "团员")]:
cv["political_status"] = re.sub(p, r, cv["political_status"])
if not re.search(r"[党团群]", cv["political_status"]):
del cv["political_status"]
if cv.get("phone"):
cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))
keys = list(cv.keys())
for k in keys:
# deal with json objects
if k.find("_obj") > 0:
try:
cv[k] = json_loads(cv[k])
cv[k] = [a for _, a in cv[k].items()]
nms = []
for n in cv[k]:
if not isinstance(n, dict) or "name" not in n or not n.get("name"):
continue
n["name"] = re.sub(r"(442|\t )", "", n["name"]).strip().lower()
if not n["name"]:
continue
nms.append(n["name"])
if nms:
t = k[:-4]
cv[f"{t}_kwd"] = nms
cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
except Exception:
logging.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
cv[k] = []
# tokenize fields
if k in tks_fld:
cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
if k in small_tks_fld:
cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
# keyword fields
if k in kwd_fld:
cv[f"{k}_kwd"] = [n.lower()
for n in re.split(r"[\t,;. ]",
re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1\2", cv[k])
) if n]
if k in num_fld and cv.get(k):
cv[f"{k}_int"] = cv[k]
cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
# for name field
if cv.get("name"):
nm = re.sub(r"[\n——\-\(\+].*", "", cv["name"].strip())
nm = re.sub(r"[ \t ]+", " ", nm)
if re.match(r"[a-zA-Z ]+$", nm):
if len(nm.split()) > 1:
cv["name"] = nm
else:
nm = ""
elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
else:
nm = ""
cv["name"] = nm.strip()
name = cv["name"]
# name pingyin and its prefix
cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
cv["name_py_pref0_tks"] = ""
cv["name_py_pref_tks"] = ""
for py in PY.get_pinyins(nm[:20], ''):
for i in range(2, len(py) + 1):
cv["name_py_pref_tks"] += " " + py[:i]
for py in PY.get_pinyins(nm[:20], ' '):
py = py.split()
for i in range(1, len(py) + 1):
cv["name_py_pref0_tks"] += " " + "".join(py[:i])
cv["name_kwd"] = name
cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
cv["name_tks"] = (
rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
) if name else ""
else:
cv["integerity_flt"] /= 2.
if cv.get("phone"):
r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
if not r:
cv["phone"] = ""
else:
cv["phone"] = r.group(1)
# deal with date fields
if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
else:
y, m, d = getYMD(str(cv.get("updated_at", "")))
if not y:
y = "2012"
if not m:
m = "01"
if not d:
d = "01"
cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
# long text tokenize
if cv.get("responsibilities"):
cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
# for yes or no field
fea = []
for f, y, n in is_fld:
if f not in cv:
continue
if cv[f] == '是':
fea.append(y)
if cv[f] == '否':
fea.append(n)
if fea:
cv["tag_kwd"] = fea
cv = forEdu(cv)
cv = forProj(cv)
cv = forWork(cv)
cv = birth(cv)
cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
for i in range(len(cv["corp_proj_sch_deg_kwd"])):
for j in cv.get("sch_rank_kwd", []):
cv["corp_proj_sch_deg_kwd"][i] += "+" + j
for i in range(len(cv["corp_proj_sch_deg_kwd"])):
if cv.get("highest_degree_kwd"):
cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]
try:
if not cv.get("work_exp_flt") and cv.get("work_start_time"):
if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
y, m, d = getYMD(str(cv["work_start_time"]))
cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
except Exception as e:
logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
if "work_exp_flt" not in cv and cv.get("work_experience", 0):
cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
keys = list(cv.keys())
for k in keys:
if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k):
del cv[k]
for k in cv.keys():
if not re.search("_(kwd|id)$", k) or not isinstance(cv[k], list):
continue
cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
for k in keys:
if cv[k] <= 0:
del cv[k]
cv["tob_resume_id"] = str(cv["tob_resume_id"])
cv["id"] = cv["tob_resume_id"]
logging.debug("CCCCCCCCCCCCCCC")
return dealWithInt64(cv)
def dealWithInt64(d):
if isinstance(d, dict):
for n, v in d.items():
d[n] = dealWithInt64(v)
if isinstance(d, list):
d = [dealWithInt64(t) for t in d]
if isinstance(d, np.integer):
d = int(d)
return d
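# Editor's sketch (not in the original source): dealWithInt64 walks nested dicts/lists and
# casts numpy integers to plain ints so the result can be JSON-serialized; the sample
# payload below is assumed.
if __name__ == "__main__":
    sample = {"age_int": np.int64(35), "scores": [np.int32(1), 2]}
    cleaned = dealWithInt64(sample)
    assert all(isinstance(v, int) for v in [cleaned["age_int"]] + cleaned["scores"])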

View File

@ -1,64 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
from deepdoc.parser.utils import get_text
from rag.nlp import num_tokens_from_string
class RAGFlowTxtParser:
def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"):
txt = get_text(fnm, binary)
return self.parser_txt(txt, chunk_token_num, delimiter)
@classmethod
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
if not isinstance(txt, str):
raise TypeError("txt type should be str!")
cks = [""]
tk_nums = [0]
delimiter = delimiter.encode('utf-8').decode('unicode_escape').encode('latin1').decode('utf-8')
def add_chunk(t):
nonlocal cks, tk_nums, delimiter
tnum = num_tokens_from_string(t)
if tk_nums[-1] > chunk_token_num:
cks.append(t)
tk_nums.append(tnum)
else:
cks[-1] += t
tk_nums[-1] += tnum
dels = []
s = 0
for m in re.finditer(r"`([^`]+)`", delimiter, re.I):
f, t = m.span()
dels.append(m.group(1))
dels.extend(list(delimiter[s: f]))
s = t
if s < len(delimiter):
dels.extend(list(delimiter[s:]))
dels = [re.escape(d) for d in dels if d]
dels = [d for d in dels if d]
dels = "|".join(dels)
secs = re.split(r"(%s)" % dels, txt)
for sec in secs:
if re.match(f"^{dels}$", sec):
continue
add_chunk(sec)
return [[c, ""] for c in cks]

View File

@ -1,32 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from rag.nlp import find_codec
def get_text(fnm: str, binary=None) -> str:
txt = ""
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r") as f:
while True:
line = f.readline()
if not line:
break
txt += line
return txt
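# Editor's sketch (not in the original source): get_text either decodes in-memory bytes,
# sniffing the codec with find_codec, or reads the file at fnm line by line; the inputs
# below are assumed examples.
if __name__ == "__main__":
    print(get_text(None, binary="hello, 世界".encode("utf-8")))  # decode raw bytes
    # print(get_text("/path/to/sample.txt"))                     # or read from disk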

View File

@ -1,86 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import io
import sys
import threading
import pdfplumber
from .ocr import OCR
from .recognizer import Recognizer
from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
from .table_structure_recognizer import TableStructureRecognizer
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
def init_in_out(args):
from PIL import Image
import os
import traceback
from api.utils.file_utils import traversal_files
images = []
outputs = []
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
def pdf_pages(fnm, zoomin=3):
nonlocal outputs, images
with sys.modules[LOCK_KEY_pdfplumber]:
pdf = pdfplumber.open(fnm)
images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
enumerate(pdf.pages)]
for i, page in enumerate(images):
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
pdf.close()
def images_and_outputs(fnm):
nonlocal outputs, images
if fnm.split(".")[-1].lower() == "pdf":
pdf_pages(fnm)
return
try:
fp = open(fnm, 'rb')
binary = fp.read()
fp.close()
images.append(Image.open(io.BytesIO(binary)).convert('RGB'))
outputs.append(os.path.split(fnm)[-1])
except Exception:
traceback.print_exc()
if os.path.isdir(args.inputs):
for fnm in traversal_files(args.inputs):
images_and_outputs(fnm)
else:
images_and_outputs(args.inputs)
for i in range(len(outputs)):
outputs[i] = os.path.join(args.output_dir, outputs[i])
return images, outputs
__all__ = [
"OCR",
"Recognizer",
"LayoutRecognizer",
"TableStructureRecognizer",
"init_in_out",
]
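# Editor's sketch (not in the original source): init_in_out expects an object with `inputs`
# and `output_dir` attributes (argparse-style); the paths below are assumed examples.
if __name__ == "__main__":
    from argparse import Namespace
    args = Namespace(inputs="tests/sample.pdf", output_dir="output/")
    images, outputs = init_in_out(args)  # PDF pages become PIL images plus per-page jpg names
    print(len(images), outputs[:2])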

View File

@ -1,245 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import re
from collections import Counter
from copy import deepcopy
import cv2
import numpy as np
from huggingface_hub import snapshot_download
from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import Recognizer
from deepdoc.vision.operators import nms
class LayoutRecognizer(Recognizer):
labels = [
"_background_",
"Text",
"Title",
"Figure",
"Figure caption",
"Table",
"Table caption",
"Header",
"Footer",
"Reference",
"Equation",
]
def __init__(self, domain):
try:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
super().__init__(self.labels, domain, model_dir)
except Exception:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
local_dir_use_symlinks=False)
super().__init__(self.labels, domain, model_dir)
self.garbage_layouts = ["footer", "header", "reference"]
def __call__(self, image_list, ocr_res, scale_factor=3,
thr=0.2, batch_size=16, drop=True):
def __is_garbage(b):
patt = [r"^•+$", r"(版权归©|免责条款|地址[:])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
"(资料|数据)来源[:]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
"\\(cid *: *[0-9]+ *\\)"
]
return any([re.search(p, b["text"]) for p in patt])
layouts = super().__call__(image_list, thr, batch_size)
# save_results(image_list, layouts, self.labels, output_dir='output/', threshold=0.7)
assert len(image_list) == len(ocr_res)
# Tag layout type
boxes = []
assert len(image_list) == len(layouts)
garbages = {}
page_layout = []
for pn, lts in enumerate(layouts):
bxs = ocr_res[pn]
lts = [{"type": b["type"],
"score": float(b["score"]),
"x0": b["bbox"][0] / scale_factor, "x1": b["bbox"][2] / scale_factor,
"top": b["bbox"][1] / scale_factor, "bottom": b["bbox"][-1] / scale_factor,
"page_number": pn,
} for b in lts if float(b["score"]) >= 0.4 or b["type"] not in self.garbage_layouts]
lts = self.sort_Y_firstly(lts, np.mean(
[lt["bottom"] - lt["top"] for lt in lts]) / 2)
lts = self.layouts_cleanup(bxs, lts)
page_layout.append(lts)
# Tag layout type, layouts are ready
def findLayout(ty):
nonlocal bxs, lts, self
lts_ = [lt for lt in lts if lt["type"] == ty]
i = 0
while i < len(bxs):
if bxs[i].get("layout_type"):
i += 1
continue
if __is_garbage(bxs[i]):
bxs.pop(i)
continue
ii = self.find_overlapped_with_threashold(bxs[i], lts_,
thr=0.4)
if ii is None: # belong to nothing
bxs[i]["layout_type"] = ""
i += 1
continue
lts_[ii]["visited"] = True
keep_feats = [
lts_[ii]["type"] == "footer" and bxs[i]["bottom"] < image_list[pn].size[1] * 0.9 / scale_factor,
lts_[ii]["type"] == "header" and bxs[i]["top"] > image_list[pn].size[1] * 0.1 / scale_factor,
]
if drop and lts_[ii]["type"] in self.garbage_layouts and not any(keep_feats):
if lts_[ii]["type"] not in garbages:
garbages[lts_[ii]["type"]] = []
garbages[lts_[ii]["type"]].append(bxs[i]["text"])
bxs.pop(i)
continue
bxs[i]["layoutno"] = f"{ty}-{ii}"
bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[
ii]["type"] != "equation" else "figure"
i += 1
for lt in ["footer", "header", "reference", "figure caption",
"table caption", "title", "table", "text", "figure", "equation"]:
findLayout(lt)
# add box to figure layouts which has not text box
for i, lt in enumerate(
[lt for lt in lts if lt["type"] in ["figure", "equation"]]):
if lt.get("visited"):
continue
lt = deepcopy(lt)
del lt["type"]
lt["text"] = ""
lt["layout_type"] = "figure"
lt["layoutno"] = f"figure-{i}"
bxs.append(lt)
boxes.extend(bxs)
ocr_res = boxes
garbag_set = set()
for k in garbages.keys():
garbages[k] = Counter(garbages[k])
for g, c in garbages[k].items():
if c > 1:
garbag_set.add(g)
ocr_res = [b for b in ocr_res if b["text"].strip() not in garbag_set]
return ocr_res, page_layout
def forward(self, image_list, thr=0.7, batch_size=16):
return super().__call__(image_list, thr, batch_size)
class LayoutRecognizer4YOLOv10(LayoutRecognizer):
labels = [
"title",
"Text",
"Reference",
"Figure",
"Figure caption",
"Table",
"Table caption",
"Table caption",
"Equation",
"Figure caption",
]
def __init__(self, domain):
domain = "layout"
super().__init__(domain)
self.auto = False
self.scaleFill = False
self.scaleup = True
self.stride = 32
self.center = True
def preprocess(self, image_list):
inputs = []
new_shape = self.input_shape # height, width
for img in image_list:
shape = img.shape[:2]  # current shape [height, width]
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
# Compute padding
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
dw /= 2 # divide padding into 2 sides
dh /= 2
ww, hh = new_unpad
img = np.array(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).astype(np.float32)
img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
img = cv2.copyMakeBorder(
img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
) # add border
img /= 255.0
img = img.transpose(2, 0, 1)
img = img[np.newaxis, :, :, :].astype(np.float32)
inputs.append({self.input_names[0]: img, "scale_factor": [shape[1]/ww, shape[0]/hh, dw, dh]})
return inputs
def postprocess(self, boxes, inputs, thr):
thr = 0.08
boxes = np.squeeze(boxes)
scores = boxes[:, 4]
boxes = boxes[scores > thr, :]
scores = scores[scores > thr]
if len(boxes) == 0:
return []
class_ids = boxes[:, -1].astype(int)
boxes = boxes[:, :4]
boxes[:, 0] -= inputs["scale_factor"][2]
boxes[:, 2] -= inputs["scale_factor"][2]
boxes[:, 1] -= inputs["scale_factor"][3]
boxes[:, 3] -= inputs["scale_factor"][3]
input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0],
inputs["scale_factor"][1]])
boxes = np.multiply(boxes, input_shape, dtype=np.float32)
unique_class_ids = np.unique(class_ids)
indices = []
for class_id in unique_class_ids:
class_indices = np.where(class_ids == class_id)[0]
class_boxes = boxes[class_indices, :]
class_scores = scores[class_indices]
class_keep_boxes = nms(class_boxes, class_scores, 0.45)
indices.extend(class_indices[class_keep_boxes])
return [{
"type": self.label_list[class_ids[i]].lower(),
"bbox": [float(t) for t in boxes[i].tolist()],
"score": float(scores[i])
} for i in indices]
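# Editor's note (not in the original source): preprocess letterboxes each page image to the
# model's input shape, e.g. for an assumed 1000x700 (h x w) page and an assumed 1024x1024 input:
#   r = min(1024/1000, 1024/700) = 1.024 -> new_unpad = (717, 1024)  (w, h)
#   dw, dh = (1024 - 717) / 2, (1024 - 1024) / 2 = 153.5, 0.0 -> padded with value 114
# postprocess then shifts and scales the predicted boxes back using the stored scale_factor.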

View File

@ -1,702 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import copy
import time
import os
from huggingface_hub import snapshot_download
from api.utils.file_utils import get_project_base_directory
from .operators import * # noqa: F403
from . import operators
import math
import numpy as np
import cv2
import onnxruntime as ort
from .postprocess import build_post_process
loaded_models = {}
def transform(data, ops=None):
""" transform """
if ops is None:
ops = []
for op in ops:
data = op(data)
if data is None:
return None
return data
def create_operators(op_param_list, global_config=None):
"""
create operators based on the config
Args:
params(list): a dict list, used to create some operators
"""
assert isinstance(
op_param_list, list), ('operator config should be a list')
ops = []
for operator in op_param_list:
assert isinstance(operator,
dict) and len(operator) == 1, "yaml format error"
op_name = list(operator)[0]
param = {} if operator[op_name] is None else operator[op_name]
if global_config is not None:
param.update(global_config)
op = getattr(operators, op_name)(**param)
ops.append(op)
return ops
def load_model(model_dir, nm):
model_file_path = os.path.join(model_dir, nm + ".onnx")
global loaded_models
loaded_model = loaded_models.get(model_file_path)
if loaded_model:
logging.info(f"load_model {model_file_path} reuses cached model")
return loaded_model
if not os.path.exists(model_file_path):
raise ValueError("not find model file path {}".format(
model_file_path))
def cuda_is_available():
try:
import torch
if torch.cuda.is_available():
return True
except Exception:
return False
return False
options = ort.SessionOptions()
options.enable_cpu_mem_arena = False
options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
options.intra_op_num_threads = 2
options.inter_op_num_threads = 2
# https://github.com/microsoft/onnxruntime/issues/9509#issuecomment-951546580
# Shrink GPU memory after execution
run_options = ort.RunOptions()
if cuda_is_available():
cuda_provider_options = {
"device_id": 0, # Use specific GPU
"gpu_mem_limit": 512 * 1024 * 1024, # Limit gpu memory
"arena_extend_strategy": "kNextPowerOfTwo", # gpu memory allocation strategy
}
sess = ort.InferenceSession(
model_file_path,
options=options,
providers=['CUDAExecutionProvider'],
provider_options=[cuda_provider_options]
)
run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "gpu:0")
logging.info(f"load_model {model_file_path} uses GPU")
else:
sess = ort.InferenceSession(
model_file_path,
options=options,
providers=['CPUExecutionProvider'])
run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "cpu")
logging.info(f"load_model {model_file_path} uses CPU")
loaded_model = (sess, run_options)
loaded_models[model_file_path] = loaded_model
return loaded_model
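# Editor's sketch (not in the original source): load_model returns an (InferenceSession,
# RunOptions) pair and caches it per file path, so repeated calls reuse the same session;
# the model directory and name below are assumed examples.
if __name__ == "__main__":
    sess_a = load_model("rag/res/deepdoc", "det")
    sess_b = load_model("rag/res/deepdoc", "det")
    assert sess_a is sess_b  # served from the loaded_models cache on the second call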
class TextRecognizer:
"""
Text recognizer: reads the actual characters inside detected text regions.
It uses a CTC (Connectionist Temporal Classification) based recognition model
to convert image regions that contain text into strings.
"""
def __init__(self, model_dir):
"""
Initialize the text recognizer.
Args:
model_dir: directory containing the model files
"""
self.rec_image_shape = [int(v) for v in "3, 48, 320".split(",")]
self.rec_batch_num = 16
postprocess_params = {
'name': 'CTCLabelDecode',
"character_dict_path": os.path.join(model_dir, "ocr.res"),
"use_space_char": True
}
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.run_options = load_model(model_dir, 'rec')
self.input_tensor = self.predictor.get_inputs()[0]
def resize_norm_img(self, img, max_wh_ratio):
"""
Resize the image while keeping its aspect ratio, then normalize it.
Args:
img: input image
max_wh_ratio: maximum width/height ratio in the current batch
Returns:
the processed image tensor
"""
imgC, imgH, imgW = self.rec_image_shape
assert imgC == img.shape[2]
imgW = int((imgH * max_wh_ratio))
w = self.input_tensor.shape[3:][0]
if isinstance(w, str):
pass
elif w is not None and w > 0:
imgW = w
h, w = img.shape[:2]
ratio = w / float(h)
if math.ceil(imgH * ratio) > imgW:
resized_w = imgW
else:
resized_w = int(math.ceil(imgH * ratio))
resized_image = cv2.resize(img, (resized_w, imgH))
resized_image = resized_image.astype('float32')
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:resized_w] = resized_image
return padding_im
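# Editor's note (not in the original source): with the default rec shape (3, 48, 320), an
# assumed 24x240 crop (ratio 10) in a batch whose max_wh_ratio is 10 is resized to 48x480;
# narrower crops in the same batch keep their own ratio and are right-padded with zeros up
# to the shared width imgW = int(48 * max_wh_ratio).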
def resize_norm_img_vl(self, img, image_shape):
imgC, imgH, imgW = image_shape
img = img[:, :, ::-1] # bgr2rgb
resized_image = cv2.resize(
img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
resized_image = resized_image.astype('float32')
resized_image = resized_image.transpose((2, 0, 1)) / 255
return resized_image
def resize_norm_img_srn(self, img, image_shape):
imgC, imgH, imgW = image_shape
img_black = np.zeros((imgH, imgW))
im_hei = img.shape[0]
im_wid = img.shape[1]
if im_wid <= im_hei * 1:
img_new = cv2.resize(img, (imgH * 1, imgH))
elif im_wid <= im_hei * 2:
img_new = cv2.resize(img, (imgH * 2, imgH))
elif im_wid <= im_hei * 3:
img_new = cv2.resize(img, (imgH * 3, imgH))
else:
img_new = cv2.resize(img, (imgW, imgH))
img_np = np.asarray(img_new)
img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
img_black[:, 0:img_np.shape[1]] = img_np
img_black = img_black[:, :, np.newaxis]
row, col, c = img_black.shape
c = 1
return np.reshape(img_black, (c, row, col)).astype(np.float32)
def srn_other_inputs(self, image_shape, num_heads, max_text_length):
imgC, imgH, imgW = image_shape
feature_dim = int((imgH / 8) * (imgW / 8))
encoder_word_pos = np.array(range(0, feature_dim)).reshape(
(feature_dim, 1)).astype('int64')
gsrm_word_pos = np.array(range(0, max_text_length)).reshape(
(max_text_length, 1)).astype('int64')
gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
[-1, 1, max_text_length, max_text_length])
gsrm_slf_attn_bias1 = np.tile(
gsrm_slf_attn_bias1,
[1, num_heads, 1, 1]).astype('float32') * [-1e9]
gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
[-1, 1, max_text_length, max_text_length])
gsrm_slf_attn_bias2 = np.tile(
gsrm_slf_attn_bias2,
[1, num_heads, 1, 1]).astype('float32') * [-1e9]
encoder_word_pos = encoder_word_pos[np.newaxis, :]
gsrm_word_pos = gsrm_word_pos[np.newaxis, :]
return [
encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2
]
def process_image_srn(self, img, image_shape, num_heads, max_text_length):
norm_img = self.resize_norm_img_srn(img, image_shape)
norm_img = norm_img[np.newaxis, :]
[encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \
self.srn_other_inputs(image_shape, num_heads, max_text_length)
gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
encoder_word_pos = encoder_word_pos.astype(np.int64)
gsrm_word_pos = gsrm_word_pos.astype(np.int64)
return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2)
def resize_norm_img_sar(self, img, image_shape,
width_downsample_ratio=0.25):
imgC, imgH, imgW_min, imgW_max = image_shape
h = img.shape[0]
w = img.shape[1]
valid_ratio = 1.0
# make sure new_width is an integral multiple of width_divisor.
width_divisor = int(1 / width_downsample_ratio)
# resize
ratio = w / float(h)
resize_w = math.ceil(imgH * ratio)
if resize_w % width_divisor != 0:
resize_w = round(resize_w / width_divisor) * width_divisor
if imgW_min is not None:
resize_w = max(imgW_min, resize_w)
if imgW_max is not None:
valid_ratio = min(1.0, 1.0 * resize_w / imgW_max)
resize_w = min(imgW_max, resize_w)
resized_image = cv2.resize(img, (resize_w, imgH))
resized_image = resized_image.astype('float32')
# norm
if image_shape[0] == 1:
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
else:
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
resize_shape = resized_image.shape
padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32)
padding_im[:, :, 0:resize_w] = resized_image
pad_shape = padding_im.shape
return padding_im, resize_shape, pad_shape, valid_ratio
def resize_norm_img_spin(self, img):
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# return padding_im
img = cv2.resize(img, tuple([100, 32]), cv2.INTER_CUBIC)
img = np.array(img, np.float32)
img = np.expand_dims(img, -1)
img = img.transpose((2, 0, 1))
mean = [127.5]
std = [127.5]
mean = np.array(mean, dtype=np.float32)
std = np.array(std, dtype=np.float32)
mean = np.float32(mean.reshape(1, -1))
stdinv = 1 / np.float32(std.reshape(1, -1))
img -= mean
img *= stdinv
return img
def resize_norm_img_svtr(self, img, image_shape):
imgC, imgH, imgW = image_shape
resized_image = cv2.resize(
img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
resized_image = resized_image.astype('float32')
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
return resized_image
def resize_norm_img_abinet(self, img, image_shape):
imgC, imgH, imgW = image_shape
resized_image = cv2.resize(
img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
resized_image = resized_image.astype('float32')
resized_image = resized_image / 255.
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])
resized_image = (
resized_image - mean[None, None, ...]) / std[None, None, ...]
resized_image = resized_image.transpose((2, 0, 1))
resized_image = resized_image.astype('float32')
return resized_image
def norm_img_can(self, img, image_shape):
img = cv2.cvtColor(
img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image
if self.rec_image_shape[0] == 1:
h, w = img.shape
_, imgH, imgW = self.rec_image_shape
if h < imgH or w < imgW:
padding_h = max(imgH - h, 0)
padding_w = max(imgW - w, 0)
img_padded = np.pad(img, ((0, padding_h), (0, padding_w)),
'constant',
constant_values=(255))
img = img_padded
img = np.expand_dims(img, 0) / 255.0 # h,w,c -> c,h,w
img = img.astype('float32')
return img
def __call__(self, img_list):
img_num = len(img_list)
# Calculate the aspect ratio of all text bars
width_list = []
for img in img_list:
width_list.append(img.shape[1] / float(img.shape[0]))
# Sorting can speed up the recognition process
indices = np.argsort(np.array(width_list))
rec_res = [['', 0.0]] * img_num
batch_num = self.rec_batch_num
st = time.time()
for beg_img_no in range(0, img_num, batch_num):
end_img_no = min(img_num, beg_img_no + batch_num)
norm_img_batch = []
imgC, imgH, imgW = self.rec_image_shape[:3]
max_wh_ratio = imgW / imgH
# max_wh_ratio = 0
for ino in range(beg_img_no, end_img_no):
h, w = img_list[indices[ino]].shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for ino in range(beg_img_no, end_img_no):
norm_img = self.resize_norm_img(img_list[indices[ino]],
max_wh_ratio)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
norm_img_batch = np.concatenate(norm_img_batch)
norm_img_batch = norm_img_batch.copy()
input_dict = {}
input_dict[self.input_tensor.name] = norm_img_batch
for i in range(100000):
try:
outputs = self.predictor.run(None, input_dict, self.run_options)
break
except Exception as e:
if i >= 3:
raise e
time.sleep(5)
preds = outputs[0]
rec_result = self.postprocess_op(preds)
for rno in range(len(rec_result)):
rec_res[indices[beg_img_no + rno]] = rec_result[rno]
return rec_res, time.time() - st
class TextDetector:
"""
Text detector: locates text regions in an image.
It uses a DB (Differentiable Binarization) based detection model to
accurately localize text regions and return the coordinates of their boxes.
"""
def __init__(self, model_dir):
pre_process_list = [{
'DetResizeForTest': {
'limit_side_len': 960,
'limit_type': "max",
}
}, {
'NormalizeImage': {
'std': [0.229, 0.224, 0.225],
'mean': [0.485, 0.456, 0.406],
'scale': '1./255.',
'order': 'hwc'
}
}, {
'ToCHWImage': None
}, {
'KeepKeys': {
'keep_keys': ['image', 'shape']
}
}]
postprocess_params = {"name": "DBPostProcess", "thresh": 0.3, "box_thresh": 0.5, "max_candidates": 1000,
"unclip_ratio": 1.5, "use_dilation": False, "score_mode": "fast", "box_type": "quad"}
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.run_options = load_model(model_dir, 'det')
self.input_tensor = self.predictor.get_inputs()[0]
img_h, img_w = self.input_tensor.shape[2:]
if isinstance(img_h, str) or isinstance(img_w, str):
pass
elif img_h is not None and img_w is not None and img_h > 0 and img_w > 0:
pre_process_list[0] = {
'DetResizeForTest': {
'image_shape': [img_h, img_w]
}
}
self.preprocess_op = create_operators(pre_process_list)
def order_points_clockwise(self, pts):
rect = np.zeros((4, 2), dtype="float32")
s = pts.sum(axis=1)
rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)]
tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0)
diff = np.diff(np.array(tmp), axis=1)
rect[1] = tmp[np.argmin(diff)]
rect[3] = tmp[np.argmax(diff)]
return rect
def clip_det_res(self, points, img_height, img_width):
for pno in range(points.shape[0]):
points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
return points
def filter_tag_det_res(self, dt_boxes, image_shape):
img_height, img_width = image_shape[0:2]
dt_boxes_new = []
for box in dt_boxes:
if isinstance(box, list):
box = np.array(box)
box = self.order_points_clockwise(box)
box = self.clip_det_res(box, img_height, img_width)
rect_width = int(np.linalg.norm(box[0] - box[1]))
rect_height = int(np.linalg.norm(box[0] - box[3]))
if rect_width <= 3 or rect_height <= 3:
continue
dt_boxes_new.append(box)
dt_boxes = np.array(dt_boxes_new)
return dt_boxes
def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
img_height, img_width = image_shape[0:2]
dt_boxes_new = []
for box in dt_boxes:
if isinstance(box, list):
box = np.array(box)
box = self.clip_det_res(box, img_height, img_width)
dt_boxes_new.append(box)
dt_boxes = np.array(dt_boxes_new)
return dt_boxes
def __call__(self, img):
ori_im = img.copy()
data = {'image': img}
st = time.time()
data = transform(data, self.preprocess_op)
img, shape_list = data
if img is None:
return None, 0
img = np.expand_dims(img, axis=0)
shape_list = np.expand_dims(shape_list, axis=0)
img = img.copy()
input_dict = {}
input_dict[self.input_tensor.name] = img
for i in range(100000):
try:
outputs = self.predictor.run(None, input_dict, self.run_options)
break
except Exception as e:
if i >= 3:
raise e
time.sleep(5)
post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
dt_boxes = post_result[0]['points']
dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
return dt_boxes, time.time() - st
class OCR:
def __init__(self, model_dir=None):
"""
If you have trouble downloading HuggingFace models, -_^ this might help!!
For Linux:
export HF_ENDPOINT=https://hf-mirror.com
For Windows:
Good luck
^_-
"""
if not model_dir:
try:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
self.text_detector = TextDetector(model_dir)
self.text_recognizer = TextRecognizer(model_dir)
except Exception:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
local_dir_use_symlinks=False)
self.text_detector = TextDetector(model_dir)
self.text_recognizer = TextRecognizer(model_dir)
self.drop_score = 0.5
self.crop_image_res_index = 0
def get_rotate_crop_image(self, img, points):
'''
img_height, img_width = img.shape[0:2]
left = int(np.min(points[:, 0]))
right = int(np.max(points[:, 0]))
top = int(np.min(points[:, 1]))
bottom = int(np.max(points[:, 1]))
img_crop = img[top:bottom, left:right, :].copy()
points[:, 0] = points[:, 0] - left
points[:, 1] = points[:, 1] - top
'''
assert len(points) == 4, "shape of points must be 4*2"
img_crop_width = int(
max(
np.linalg.norm(points[0] - points[1]),
np.linalg.norm(points[2] - points[3])))
img_crop_height = int(
max(
np.linalg.norm(points[0] - points[3]),
np.linalg.norm(points[1] - points[2])))
pts_std = np.float32([[0, 0], [img_crop_width, 0],
[img_crop_width, img_crop_height],
[0, img_crop_height]])
M = cv2.getPerspectiveTransform(points, pts_std)
dst_img = cv2.warpPerspective(
img,
M, (img_crop_width, img_crop_height),
borderMode=cv2.BORDER_REPLICATE,
flags=cv2.INTER_CUBIC)
dst_img_height, dst_img_width = dst_img.shape[0:2]
if dst_img_height * 1.0 / dst_img_width >= 1.5:
dst_img = np.rot90(dst_img)
return dst_img
def sorted_boxes(self, dt_boxes):
"""
Sort text boxes in order from top to bottom, left to right
args:
dt_boxes(array):detected text boxes with shape [4, 2]
return:
sorted boxes(array) with shape [4, 2]
"""
num_boxes = dt_boxes.shape[0]
sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
_boxes = list(sorted_boxes)
for i in range(num_boxes - 1):
for j in range(i, -1, -1):
if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
(_boxes[j + 1][0][0] < _boxes[j][0][0]):
tmp = _boxes[j]
_boxes[j] = _boxes[j + 1]
_boxes[j + 1] = tmp
else:
break
return _boxes
def detect(self, img):
time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
if img is None:
return None, None, time_dict
start = time.time()
dt_boxes, elapse = self.text_detector(img)
time_dict['det'] = elapse
if dt_boxes is None:
end = time.time()
time_dict['all'] = end - start
return None, None, time_dict
return zip(self.sorted_boxes(dt_boxes), [
("", 0) for _ in range(len(dt_boxes))])
def recognize(self, ori_im, box):
img_crop = self.get_rotate_crop_image(ori_im, box)
rec_res, elapse = self.text_recognizer([img_crop])
text, score = rec_res[0]
if score < self.drop_score:
return ""
return text
def recognize_batch(self, img_list):
rec_res, elapse = self.text_recognizer(img_list)
texts = []
for i in range(len(rec_res)):
text, score = rec_res[i]
if score < self.drop_score:
text = ""
texts.append(text)
return texts
def __call__(self, img, cls=True):
time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
if img is None:
return None, None, time_dict
start = time.time()
ori_im = img.copy()
dt_boxes, elapse = self.text_detector(img)
time_dict['det'] = elapse
if dt_boxes is None:
end = time.time()
time_dict['all'] = end - start
return None, None, time_dict
img_crop_list = []
dt_boxes = self.sorted_boxes(dt_boxes)
for bno in range(len(dt_boxes)):
tmp_box = copy.deepcopy(dt_boxes[bno])
img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
img_crop_list.append(img_crop)
rec_res, elapse = self.text_recognizer(img_crop_list)
time_dict['rec'] = elapse
filter_boxes, filter_rec_res = [], []
for box, rec_result in zip(dt_boxes, rec_res):
text, score = rec_result
if score >= self.drop_score:
filter_boxes.append(box)
filter_rec_res.append(rec_result)
end = time.time()
time_dict['all'] = end - start
# for bno in range(len(img_crop_list)):
# print(f"{bno}, {rec_res[bno]}")
return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))
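# Editor's sketch (not in the original source): end-to-end usage of the OCR pipeline on a
# single page image; the command-line image path is an assumed example.
if __name__ == "__main__":
    import sys
    page = cv2.imread(sys.argv[1])          # BGR ndarray
    for box, (text, score) in OCR()(page):  # detection + recognition, low scores dropped
        print(score, text, box)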

View File

@ -1,725 +0,0 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import sys
import six
import cv2
import numpy as np
import math
from PIL import Image
class DecodeImage:
""" decode image """
def __init__(self,
img_mode='RGB',
channel_first=False,
ignore_orientation=False,
**kwargs):
self.img_mode = img_mode
self.channel_first = channel_first
self.ignore_orientation = ignore_orientation
def __call__(self, data):
img = data['image']
if six.PY2:
assert isinstance(img, str) and len(
img) > 0, "invalid input 'img' in DecodeImage"
else:
assert isinstance(img, bytes) and len(
img) > 0, "invalid input 'img' in DecodeImage"
img = np.frombuffer(img, dtype='uint8')
if self.ignore_orientation:
img = cv2.imdecode(img, cv2.IMREAD_IGNORE_ORIENTATION |
cv2.IMREAD_COLOR)
else:
img = cv2.imdecode(img, 1)
if img is None:
return None
if self.img_mode == 'GRAY':
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
elif self.img_mode == 'RGB':
assert img.shape[2] == 3, 'invalid shape of image[%s]' % (
img.shape)
img = img[:, :, ::-1]
if self.channel_first:
img = img.transpose((2, 0, 1))
data['image'] = img
return data
class StandardizeImage:
"""normalize image
Args:
mean (list): im - mean
std (list): im / std
is_scale (bool): whether need im / 255
norm_type (str): type in ['mean_std', 'none']
"""
def __init__(self, mean, std, is_scale=True, norm_type='mean_std'):
self.mean = mean
self.std = std
self.is_scale = is_scale
self.norm_type = norm_type
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
im = im.astype(np.float32, copy=False)
if self.is_scale:
scale = 1.0 / 255.0
im *= scale
if self.norm_type == 'mean_std':
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
im -= mean
im /= std
return im, im_info
class NormalizeImage:
""" normalize image such as subtract mean, divide std
"""
def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs):
if isinstance(scale, str):
scale = eval(scale)
self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
mean = mean if mean is not None else [0.485, 0.456, 0.406]
std = std if std is not None else [0.229, 0.224, 0.225]
shape = (3, 1, 1) if order == 'chw' else (1, 1, 3)
self.mean = np.array(mean).reshape(shape).astype('float32')
self.std = np.array(std).reshape(shape).astype('float32')
def __call__(self, data):
img = data['image']
from PIL import Image
if isinstance(img, Image.Image):
img = np.array(img)
assert isinstance(img,
np.ndarray), "invalid input 'img' in NormalizeImage"
data['image'] = (
img.astype('float32') * self.scale - self.mean) / self.std
return data
class ToCHWImage:
""" convert hwc image to chw image
"""
def __init__(self, **kwargs):
pass
def __call__(self, data):
img = data['image']
from PIL import Image
if isinstance(img, Image.Image):
img = np.array(img)
data['image'] = img.transpose((2, 0, 1))
return data
class KeepKeys:
def __init__(self, keep_keys, **kwargs):
self.keep_keys = keep_keys
def __call__(self, data):
data_list = []
for key in self.keep_keys:
data_list.append(data[key])
return data_list
class Pad:
def __init__(self, size=None, size_div=32, **kwargs):
if size is not None and not isinstance(size, (int, list, tuple)):
raise TypeError("Type of target_size is invalid. Now is {}".format(
type(size)))
if isinstance(size, int):
size = [size, size]
self.size = size
self.size_div = size_div
def __call__(self, data):
img = data['image']
img_h, img_w = img.shape[0], img.shape[1]
if self.size:
resize_h2, resize_w2 = self.size
assert (
img_h < resize_h2 and img_w < resize_w2
), '(h, w) of target size should be greater than (img_h, img_w)'
else:
resize_h2 = max(
int(math.ceil(img.shape[0] / self.size_div) * self.size_div),
self.size_div)
resize_w2 = max(
int(math.ceil(img.shape[1] / self.size_div) * self.size_div),
self.size_div)
img = cv2.copyMakeBorder(
img,
0,
resize_h2 - img_h,
0,
resize_w2 - img_w,
cv2.BORDER_CONSTANT,
value=0)
data['image'] = img
return data
class LinearResize:
"""resize image by target_size and max_size
Args:
target_size (int): the target size of image
keep_ratio (bool): whether keep_ratio or not, default true
interp (int): method of resize
"""
def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR):
if isinstance(target_size, int):
target_size = [target_size, target_size]
self.target_size = target_size
self.keep_ratio = keep_ratio
self.interp = interp
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
assert len(self.target_size) == 2
assert self.target_size[0] > 0 and self.target_size[1] > 0
_im_channel = im.shape[2]
im_scale_y, im_scale_x = self.generate_scale(im)
im = cv2.resize(
im,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
im_info['im_shape'] = np.array(im.shape[:2]).astype('float32')
im_info['scale_factor'] = np.array(
[im_scale_y, im_scale_x]).astype('float32')
return im, im_info
def generate_scale(self, im):
"""
Args:
im (np.ndarray): image (np.ndarray)
Returns:
im_scale_x: the resize ratio of X
im_scale_y: the resize ratio of Y
"""
origin_shape = im.shape[:2]
_im_c = im.shape[2]
if self.keep_ratio:
im_size_min = np.min(origin_shape)
im_size_max = np.max(origin_shape)
target_size_min = np.min(self.target_size)
target_size_max = np.max(self.target_size)
im_scale = float(target_size_min) / float(im_size_min)
if np.round(im_scale * im_size_max) > target_size_max:
im_scale = float(target_size_max) / float(im_size_max)
im_scale_x = im_scale
im_scale_y = im_scale
else:
resize_h, resize_w = self.target_size
im_scale_y = resize_h / float(origin_shape[0])
im_scale_x = resize_w / float(origin_shape[1])
return im_scale_y, im_scale_x
class Resize:
def __init__(self, size=(640, 640), **kwargs):
self.size = size
def resize_image(self, img):
resize_h, resize_w = self.size
ori_h, ori_w = img.shape[:2] # (h, w, c)
ratio_h = float(resize_h) / ori_h
ratio_w = float(resize_w) / ori_w
img = cv2.resize(img, (int(resize_w), int(resize_h)))
return img, [ratio_h, ratio_w]
def __call__(self, data):
img = data['image']
if 'polys' in data:
text_polys = data['polys']
img_resize, [ratio_h, ratio_w] = self.resize_image(img)
if 'polys' in data:
new_boxes = []
for box in text_polys:
new_box = []
for cord in box:
new_box.append([cord[0] * ratio_w, cord[1] * ratio_h])
new_boxes.append(new_box)
data['polys'] = np.array(new_boxes, dtype=np.float32)
data['image'] = img_resize
return data
class DetResizeForTest:
def __init__(self, **kwargs):
super(DetResizeForTest, self).__init__()
self.resize_type = 0
self.keep_ratio = False
if 'image_shape' in kwargs:
self.image_shape = kwargs['image_shape']
self.resize_type = 1
if 'keep_ratio' in kwargs:
self.keep_ratio = kwargs['keep_ratio']
elif 'limit_side_len' in kwargs:
self.limit_side_len = kwargs['limit_side_len']
self.limit_type = kwargs.get('limit_type', 'min')
elif 'resize_long' in kwargs:
self.resize_type = 2
self.resize_long = kwargs.get('resize_long', 960)
else:
self.limit_side_len = 736
self.limit_type = 'min'
def __call__(self, data):
img = data['image']
src_h, src_w, _ = img.shape
if sum([src_h, src_w]) < 64:
img = self.image_padding(img)
if self.resize_type == 0:
# img, shape = self.resize_image_type0(img)
img, [ratio_h, ratio_w] = self.resize_image_type0(img)
elif self.resize_type == 2:
img, [ratio_h, ratio_w] = self.resize_image_type2(img)
else:
# img, shape = self.resize_image_type1(img)
img, [ratio_h, ratio_w] = self.resize_image_type1(img)
data['image'] = img
data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w])
return data
def image_padding(self, im, value=0):
h, w, c = im.shape
im_pad = np.zeros((max(32, h), max(32, w), c), np.uint8) + value
im_pad[:h, :w, :] = im
return im_pad
def resize_image_type1(self, img):
resize_h, resize_w = self.image_shape
ori_h, ori_w = img.shape[:2] # (h, w, c)
if self.keep_ratio is True:
resize_w = ori_w * resize_h / ori_h
N = math.ceil(resize_w / 32)
resize_w = N * 32
ratio_h = float(resize_h) / ori_h
ratio_w = float(resize_w) / ori_w
img = cv2.resize(img, (int(resize_w), int(resize_h)))
# return img, np.array([ori_h, ori_w])
return img, [ratio_h, ratio_w]
def resize_image_type0(self, img):
"""
resize image to a size multiple of 32 which is required by the network
args:
img(array): array with shape [h, w, c]
return(tuple):
img, (ratio_h, ratio_w)
"""
limit_side_len = self.limit_side_len
h, w, c = img.shape
# limit the max side
if self.limit_type == 'max':
if max(h, w) > limit_side_len:
if h > w:
ratio = float(limit_side_len) / h
else:
ratio = float(limit_side_len) / w
else:
ratio = 1.
elif self.limit_type == 'min':
if min(h, w) < limit_side_len:
if h < w:
ratio = float(limit_side_len) / h
else:
ratio = float(limit_side_len) / w
else:
ratio = 1.
elif self.limit_type == 'resize_long':
ratio = float(limit_side_len) / max(h, w)
else:
raise Exception('unsupported limit_type: {}'.format(self.limit_type))
resize_h = int(h * ratio)
resize_w = int(w * ratio)
resize_h = max(int(round(resize_h / 32) * 32), 32)
resize_w = max(int(round(resize_w / 32) * 32), 32)
try:
if int(resize_w) <= 0 or int(resize_h) <= 0:
return None, (None, None)
img = cv2.resize(img, (int(resize_w), int(resize_h)))
except BaseException:
logging.exception("{} {} {}".format(img.shape, resize_w, resize_h))
sys.exit(0)
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
return img, [ratio_h, ratio_w]
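# Editor's note (not in the original source): with limit_type "max" and limit_side_len 960
# (the values used by TextDetector), an assumed 1000x700 (h x w) image gets ratio 960/1000
# = 0.96, i.e. 960x672, and both sides are then rounded to multiples of 32 -> still 960x672.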
def resize_image_type2(self, img):
h, w, _ = img.shape
resize_w = w
resize_h = h
if resize_h > resize_w:
ratio = float(self.resize_long) / resize_h
else:
ratio = float(self.resize_long) / resize_w
resize_h = int(resize_h * ratio)
resize_w = int(resize_w * ratio)
max_stride = 128
resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
img = cv2.resize(img, (int(resize_w), int(resize_h)))
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
return img, [ratio_h, ratio_w]
class E2EResizeForTest:
def __init__(self, **kwargs):
super(E2EResizeForTest, self).__init__()
self.max_side_len = kwargs['max_side_len']
self.valid_set = kwargs['valid_set']
def __call__(self, data):
img = data['image']
src_h, src_w, _ = img.shape
if self.valid_set == 'totaltext':
im_resized, [ratio_h, ratio_w] = self.resize_image_for_totaltext(
img, max_side_len=self.max_side_len)
else:
im_resized, (ratio_h, ratio_w) = self.resize_image(
img, max_side_len=self.max_side_len)
data['image'] = im_resized
data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w])
return data
def resize_image_for_totaltext(self, im, max_side_len=512):
h, w, _ = im.shape
resize_w = w
resize_h = h
ratio = 1.25
if h * ratio > max_side_len:
ratio = float(max_side_len) / resize_h
resize_h = int(resize_h * ratio)
resize_w = int(resize_w * ratio)
max_stride = 128
resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
im = cv2.resize(im, (int(resize_w), int(resize_h)))
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
return im, (ratio_h, ratio_w)
def resize_image(self, im, max_side_len=512):
"""
resize image to a size multiple of max_stride which is required by the network
:param im: the resized image
:param max_side_len: limit of max image size to avoid out of memory in gpu
:return: the resized image and the resize ratio
"""
h, w, _ = im.shape
resize_w = w
resize_h = h
# Fix the longer side
if resize_h > resize_w:
ratio = float(max_side_len) / resize_h
else:
ratio = float(max_side_len) / resize_w
resize_h = int(resize_h * ratio)
resize_w = int(resize_w * ratio)
max_stride = 128
resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
im = cv2.resize(im, (int(resize_w), int(resize_h)))
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
return im, (ratio_h, ratio_w)
class KieResize:
def __init__(self, **kwargs):
super(KieResize, self).__init__()
self.max_side, self.min_side = kwargs['img_scale'][0], kwargs[
'img_scale'][1]
def __call__(self, data):
img = data['image']
points = data['points']
src_h, src_w, _ = img.shape
im_resized, scale_factor, [ratio_h, ratio_w], [new_h, new_w] = self.resize_image(img)
resize_points = self.resize_boxes(img, points, scale_factor)
data['ori_image'] = img
data['ori_boxes'] = points
data['points'] = resize_points
data['image'] = im_resized
data['shape'] = np.array([new_h, new_w])
return data
def resize_image(self, img):
norm_img = np.zeros([1024, 1024, 3], dtype='float32')
scale = [512, 1024]
h, w = img.shape[:2]
max_long_edge = max(scale)
max_short_edge = min(scale)
scale_factor = min(max_long_edge / max(h, w),
max_short_edge / min(h, w))
resize_w, resize_h = int(w * float(scale_factor) + 0.5), int(h * float(
scale_factor) + 0.5)
max_stride = 32
resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
im = cv2.resize(img, (resize_w, resize_h))
new_h, new_w = im.shape[:2]
w_scale = new_w / w
h_scale = new_h / h
scale_factor = np.array(
[w_scale, h_scale, w_scale, h_scale], dtype=np.float32)
norm_img[:new_h, :new_w, :] = im
return norm_img, scale_factor, [h_scale, w_scale], [new_h, new_w]
def resize_boxes(self, im, points, scale_factor):
points = points * scale_factor
img_shape = im.shape[:2]
points[:, 0::2] = np.clip(points[:, 0::2], 0, img_shape[1])
points[:, 1::2] = np.clip(points[:, 1::2], 0, img_shape[0])
return points
class SRResize:
def __init__(self,
imgH=32,
imgW=128,
down_sample_scale=4,
keep_ratio=False,
min_ratio=1,
mask=False,
infer_mode=False,
**kwargs):
self.imgH = imgH
self.imgW = imgW
self.keep_ratio = keep_ratio
self.min_ratio = min_ratio
self.down_sample_scale = down_sample_scale
self.mask = mask
self.infer_mode = infer_mode
def __call__(self, data):
imgH = self.imgH
imgW = self.imgW
images_lr = data["image_lr"]
transform2 = ResizeNormalize(
(imgW // self.down_sample_scale, imgH // self.down_sample_scale))
images_lr = transform2(images_lr)
data["img_lr"] = images_lr
if self.infer_mode:
return data
images_HR = data["image_hr"]
_label_strs = data["label"]
transform = ResizeNormalize((imgW, imgH))
images_HR = transform(images_HR)
data["img_hr"] = images_HR
return data
class ResizeNormalize:
def __init__(self, size, interpolation=Image.BICUBIC):
self.size = size
self.interpolation = interpolation
def __call__(self, img):
img = img.resize(self.size, self.interpolation)
img_numpy = np.array(img).astype("float32")
img_numpy = img_numpy.transpose((2, 0, 1)) / 255
return img_numpy
class GrayImageChannelFormat:
"""
format gray scale image's channel: (3,h,w) -> (1,h,w)
Args:
inverse: inverse gray image
"""
def __init__(self, inverse=False, **kwargs):
self.inverse = inverse
def __call__(self, data):
img = data['image']
img_single_channel = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_expanded = np.expand_dims(img_single_channel, 0)
if self.inverse:
data['image'] = np.abs(img_expanded - 1)
else:
data['image'] = img_expanded
data['src_image'] = img
return data
class Permute:
"""permute image
Args:
to_bgr (bool): whether convert RGB to BGR
channel_first (bool): whether convert HWC to CHW
"""
def __init__(self, ):
super(Permute, self).__init__()
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
im = im.transpose((2, 0, 1)).copy()
return im, im_info
class PadStride:
""" padding image for model with FPN, instead PadBatch(pad_to_stride) in original config
Args:
stride (bool): model with FPN need image shape % stride == 0
"""
def __init__(self, stride=0):
self.coarsest_stride = stride
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
coarsest_stride = self.coarsest_stride
if coarsest_stride <= 0:
return im, im_info
im_c, im_h, im_w = im.shape
pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
padding_im[:, :im_h, :im_w] = im
return padding_im, im_info
def decode_image(im_file, im_info):
"""read rgb image
Args:
im_file (str|np.ndarray): input can be image path or np.ndarray
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
if isinstance(im_file, str):
with open(im_file, 'rb') as f:
im_read = f.read()
data = np.frombuffer(im_read, dtype='uint8')
im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
else:
im = im_file
im_info['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
im_info['scale_factor'] = np.array([1., 1.], dtype=np.float32)
return im, im_info
def preprocess(im, preprocess_ops):
# process image by preprocess_ops
im_info = {
'scale_factor': np.array(
[1., 1.], dtype=np.float32),
'im_shape': None,
}
im, im_info = decode_image(im, im_info)
for operator in preprocess_ops:
im, im_info = operator(im, im_info)
return im, im_info
def nms(bboxes, scores, iou_thresh):
import numpy as np
x1 = bboxes[:, 0]
y1 = bboxes[:, 1]
x2 = bboxes[:, 2]
y2 = bboxes[:, 3]
areas = (y2 - y1) * (x2 - x1)
indices = []
index = scores.argsort()[::-1]
while index.size > 0:
i = index[0]
indices.append(i)
x11 = np.maximum(x1[i], x1[index[1:]])
y11 = np.maximum(y1[i], y1[index[1:]])
x22 = np.minimum(x2[i], x2[index[1:]])
y22 = np.minimum(y2[i], y2[index[1:]])
w = np.maximum(0, x22 - x11 + 1)
h = np.maximum(0, y22 - y11 + 1)
overlaps = w * h
ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
idx = np.where(ious <= iou_thresh)[0]
index = index[idx + 1]
return indices
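# Editor's sketch (not in the original source): nms keeps the highest-scoring box and drops
# any box whose IoU with it exceeds the threshold; the coordinates below are assumed.
if __name__ == "__main__":
    boxes = np.array([[0, 0, 10, 10], [1, 1, 10, 10], [20, 20, 30, 30]], dtype=np.float32)
    scores = np.array([0.9, 0.8, 0.7], dtype=np.float32)
    print(nms(boxes, scores, iou_thresh=0.5))  # keeps boxes 0 and 2; the near-duplicate box 1 is suppressed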

View File

@ -1,370 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
import numpy as np
import cv2
from shapely.geometry import Polygon
import pyclipper
def build_post_process(config, global_config=None):
support_dict = {'DBPostProcess': DBPostProcess, 'CTCLabelDecode': CTCLabelDecode}
config = copy.deepcopy(config)
module_name = config.pop('name')
if module_name == "None":
return
if global_config is not None:
config.update(global_config)
module_class = support_dict.get(module_name)
if module_class is None:
raise ValueError(
'post process only supports {}'.format(list(support_dict)))
return module_class(**config)
class DBPostProcess:
"""
The post process for Differentiable Binarization (DB).
"""
def __init__(self,
thresh=0.3,
box_thresh=0.7,
max_candidates=1000,
unclip_ratio=2.0,
use_dilation=False,
score_mode="fast",
box_type='quad',
**kwargs):
self.thresh = thresh
self.box_thresh = box_thresh
self.max_candidates = max_candidates
self.unclip_ratio = unclip_ratio
self.min_size = 3
self.score_mode = score_mode
self.box_type = box_type
assert score_mode in [
"slow", "fast"
], "Score mode must be in [slow, fast] but got: {}".format(score_mode)
self.dilation_kernel = None if not use_dilation else np.array(
[[1, 1], [1, 1]])
def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
'''
_bitmap: single map with shape (1, H, W),
whose values are binarized as {0, 1}
'''
bitmap = _bitmap
height, width = bitmap.shape
boxes = []
scores = []
contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8),
cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
for contour in contours[:self.max_candidates]:
epsilon = 0.002 * cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, epsilon, True)
points = approx.reshape((-1, 2))
if points.shape[0] < 4:
continue
score = self.box_score_fast(pred, points.reshape(-1, 2))
if self.box_thresh > score:
continue
if points.shape[0] > 2:
box = self.unclip(points, self.unclip_ratio)
if len(box) > 1:
continue
else:
continue
box = box.reshape(-1, 2)
_, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
if sside < self.min_size + 2:
continue
box = np.array(box)
box[:, 0] = np.clip(
np.round(box[:, 0] / width * dest_width), 0, dest_width)
box[:, 1] = np.clip(
np.round(box[:, 1] / height * dest_height), 0, dest_height)
boxes.append(box.tolist())
scores.append(score)
return boxes, scores
def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
'''
_bitmap: single map with shape (1, H, W),
whose values are binarized as {0, 1}
'''
bitmap = _bitmap
height, width = bitmap.shape
outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST,
cv2.CHAIN_APPROX_SIMPLE)
if len(outs) == 3:
_img, contours, _ = outs[0], outs[1], outs[2]
elif len(outs) == 2:
contours, _ = outs[0], outs[1]
num_contours = min(len(contours), self.max_candidates)
boxes = []
scores = []
for index in range(num_contours):
contour = contours[index]
points, sside = self.get_mini_boxes(contour)
if sside < self.min_size:
continue
points = np.array(points)
if self.score_mode == "fast":
score = self.box_score_fast(pred, points.reshape(-1, 2))
else:
score = self.box_score_slow(pred, contour)
if self.box_thresh > score:
continue
box = self.unclip(points, self.unclip_ratio).reshape(-1, 1, 2)
box, sside = self.get_mini_boxes(box)
if sside < self.min_size + 2:
continue
box = np.array(box)
box[:, 0] = np.clip(
np.round(box[:, 0] / width * dest_width), 0, dest_width)
box[:, 1] = np.clip(
np.round(box[:, 1] / height * dest_height), 0, dest_height)
boxes.append(box.astype("int32"))
scores.append(score)
return np.array(boxes, dtype="int32"), scores
def unclip(self, box, unclip_ratio):
poly = Polygon(box)
distance = poly.area * unclip_ratio / poly.length
offset = pyclipper.PyclipperOffset()
offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
expanded = np.array(offset.Execute(distance))
return expanded
def get_mini_boxes(self, contour):
bounding_box = cv2.minAreaRect(contour)
points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])
index_1, index_2, index_3, index_4 = 0, 1, 2, 3
if points[1][1] > points[0][1]:
index_1 = 0
index_4 = 1
else:
index_1 = 1
index_4 = 0
if points[3][1] > points[2][1]:
index_2 = 2
index_3 = 3
else:
index_2 = 3
index_3 = 2
box = [
points[index_1], points[index_2], points[index_3], points[index_4]
]
return box, min(bounding_box[1])
def box_score_fast(self, bitmap, _box):
'''
box_score_fast: use the mean score inside the bounding box as the box score
'''
h, w = bitmap.shape[:2]
box = _box.copy()
xmin = np.clip(np.floor(box[:, 0].min()).astype("int32"), 0, w - 1)
xmax = np.clip(np.ceil(box[:, 0].max()).astype("int32"), 0, w - 1)
ymin = np.clip(np.floor(box[:, 1].min()).astype("int32"), 0, h - 1)
ymax = np.clip(np.ceil(box[:, 1].max()).astype("int32"), 0, h - 1)
mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
box[:, 0] = box[:, 0] - xmin
box[:, 1] = box[:, 1] - ymin
cv2.fillPoly(mask, box.reshape(1, -1, 2).astype("int32"), 1)
return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
def box_score_slow(self, bitmap, contour):
'''
box_score_slow: use the mean score inside the polygon as the box score
'''
h, w = bitmap.shape[:2]
contour = contour.copy()
contour = np.reshape(contour, (-1, 2))
xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)
mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
contour[:, 0] = contour[:, 0] - xmin
contour[:, 1] = contour[:, 1] - ymin
cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1)
return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
def __call__(self, outs_dict, shape_list):
pred = outs_dict['maps']
if not isinstance(pred, np.ndarray):
pred = pred.numpy()
pred = pred[:, 0, :, :]
segmentation = pred > self.thresh
boxes_batch = []
for batch_index in range(pred.shape[0]):
src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
if self.dilation_kernel is not None:
mask = cv2.dilate(
np.array(segmentation[batch_index]).astype(np.uint8),
self.dilation_kernel)
else:
mask = segmentation[batch_index]
if self.box_type == 'poly':
boxes, scores = self.polygons_from_bitmap(pred[batch_index],
mask, src_w, src_h)
elif self.box_type == 'quad':
boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,
src_w, src_h)
else:
raise ValueError(
"box_type can only be one of ['quad', 'poly']")
boxes_batch.append({'points': boxes})
return boxes_batch
class BaseRecLabelDecode:
""" Convert between text-label and text-index """
def __init__(self, character_dict_path=None, use_space_char=False):
self.beg_str = "sos"
self.end_str = "eos"
self.reverse = False
self.character_str = []
if character_dict_path is None:
self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
dict_character = list(self.character_str)
else:
with open(character_dict_path, "rb") as fin:
lines = fin.readlines()
for line in lines:
line = line.decode('utf-8').strip("\n").strip("\r\n")
self.character_str.append(line)
if use_space_char:
self.character_str.append(" ")
dict_character = list(self.character_str)
if 'arabic' in character_dict_path:
self.reverse = True
dict_character = self.add_special_char(dict_character)
self.dict = {}
for i, char in enumerate(dict_character):
self.dict[char] = i
self.character = dict_character
def pred_reverse(self, pred):
pred_re = []
c_current = ''
for c in pred:
if not bool(re.search('[a-zA-Z0-9 :*./%+-]', c)):
if c_current != '':
pred_re.append(c_current)
pred_re.append(c)
c_current = ''
else:
c_current += c
if c_current != '':
pred_re.append(c_current)
return ''.join(pred_re[::-1])
def add_special_char(self, dict_character):
return dict_character
def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
""" convert text-index into text-label. """
result_list = []
ignored_tokens = self.get_ignored_tokens()
batch_size = len(text_index)
for batch_idx in range(batch_size):
selection = np.ones(len(text_index[batch_idx]), dtype=bool)
if is_remove_duplicate:
selection[1:] = text_index[batch_idx][1:] != text_index[
batch_idx][:-1]
for ignored_token in ignored_tokens:
selection &= text_index[batch_idx] != ignored_token
char_list = [
self.character[text_id]
for text_id in text_index[batch_idx][selection]
]
if text_prob is not None:
conf_list = text_prob[batch_idx][selection]
else:
conf_list = [1] * len(selection)
if len(conf_list) == 0:
conf_list = [0]
text = ''.join(char_list)
if self.reverse: # for arabic rec
text = self.pred_reverse(text)
result_list.append((text, np.mean(conf_list).tolist()))
return result_list
def get_ignored_tokens(self):
return [0] # for ctc blank
class CTCLabelDecode(BaseRecLabelDecode):
""" Convert between text-label and text-index """
def __init__(self, character_dict_path=None, use_space_char=False,
**kwargs):
super(CTCLabelDecode, self).__init__(character_dict_path,
use_space_char)
def __call__(self, preds, label=None, *args, **kwargs):
if isinstance(preds, tuple) or isinstance(preds, list):
preds = preds[-1]
if not isinstance(preds, np.ndarray):
preds = preds.numpy()
preds_idx = preds.argmax(axis=2)
preds_prob = preds.max(axis=2)
text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True)
if label is None:
return text
label = self.decode(label)
return text, label
def add_special_char(self, dict_character):
dict_character = ['blank'] + dict_character
return dict_character
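Aside: a minimal, self-contained sketch of the duplicate-collapse and blank-removal step performed by the decode() method of CTCLabelDecode above; the toy vocabulary and per-timestep predictions are illustrative assumptions.
import numpy as np
characters = ['blank', 'c', 'a', 't']      # index 0 is the CTC blank
pred = np.array([1, 1, 0, 2, 2, 0, 3])     # per-timestep argmax: "cc_aa_t"
selection = np.ones(len(pred), dtype=bool)
selection[1:] = pred[1:] != pred[:-1]      # drop repeated indices
selection &= pred != 0                     # drop blanks
print(''.join(characters[i] for i in pred[selection]))   # -> "cat"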

View File

@ -1,435 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import math
import numpy as np
import cv2
from functools import cmp_to_key
from api.utils.file_utils import get_project_base_directory
from .operators import * # noqa: F403
from .operators import preprocess
from . import operators
from .ocr import load_model
class Recognizer:
def __init__(self, label_list, task_name, model_dir=None):
"""
If you have trouble downloading HuggingFace models, -_^ this might help!!
For Linux:
export HF_ENDPOINT=https://hf-mirror.com
For Windows:
Good luck
^_-
"""
if not model_dir:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
self.ort_sess, self.run_options = load_model(model_dir, task_name)
self.input_names = [node.name for node in self.ort_sess.get_inputs()]
self.output_names = [node.name for node in self.ort_sess.get_outputs()]
self.input_shape = self.ort_sess.get_inputs()[0].shape[2:4]
self.label_list = label_list
@staticmethod
def sort_Y_firstly(arr, threashold):
def cmp(c1, c2):
diff = c1["top"] - c2["top"]
if abs(diff) < threashold:
diff = c1["x0"] - c2["x0"]
return diff
arr = sorted(arr, key=cmp_to_key(cmp))
return arr
@staticmethod
def sort_X_firstly(arr, threashold):
def cmp(c1, c2):
diff = c1["x0"] - c2["x0"]
if abs(diff) < threashold:
diff = c1["top"] - c2["top"]
return diff
arr = sorted(arr, key=cmp_to_key(cmp))
return arr
@staticmethod
def sort_C_firstly(arr, thr=0):
# sort by x0 first, then refine the order using the column index C and top
# sorted(arr, key=lambda r: (r["x0"], r["top"]))
arr = Recognizer.sort_X_firstly(arr, thr)
for i in range(len(arr) - 1):
for j in range(i, -1, -1):
# restore the order using the column index C
if "C" not in arr[j] or "C" not in arr[j + 1]:
continue
if arr[j + 1]["C"] < arr[j]["C"] \
or (
arr[j + 1]["C"] == arr[j]["C"]
and arr[j + 1]["top"] < arr[j]["top"]
):
tmp = arr[j]
arr[j] = arr[j + 1]
arr[j + 1] = tmp
return arr
@staticmethod
def sort_R_firstly(arr, thr=0):
# sort by top first, then refine the order using the row index R and x0
# sorted(arr, key=lambda r: (r["top"], r["x0"]))
arr = Recognizer.sort_Y_firstly(arr, thr)
for i in range(len(arr) - 1):
for j in range(i, -1, -1):
if "R" not in arr[j] or "R" not in arr[j + 1]:
continue
if arr[j + 1]["R"] < arr[j]["R"] \
or (
arr[j + 1]["R"] == arr[j]["R"]
and arr[j + 1]["x0"] < arr[j]["x0"]
):
tmp = arr[j]
arr[j] = arr[j + 1]
arr[j + 1] = tmp
return arr
@staticmethod
def overlapped_area(a, b, ratio=True):
tp, btm, x0, x1 = a["top"], a["bottom"], a["x0"], a["x1"]
if b["x0"] > x1 or b["x1"] < x0:
return 0
if b["bottom"] < tp or b["top"] > btm:
return 0
x0_ = max(b["x0"], x0)
x1_ = min(b["x1"], x1)
assert x0_ <= x1_, "Bbox mismatch! T:{},B:{},X0:{},X1:{} ==> {}".format(
tp, btm, x0, x1, b)
tp_ = max(b["top"], tp)
btm_ = min(b["bottom"], btm)
assert tp_ <= btm_, "Bbox mismatch! T:{},B:{},X0:{},X1:{} => {}".format(
tp, btm, x0, x1, b)
ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
x0 != 0 and btm - tp != 0 else 0
if ov > 0 and ratio:
ov /= (x1 - x0) * (btm - tp)
return ov
@staticmethod
def layouts_cleanup(boxes, layouts, far=2, thr=0.7):
def notOverlapped(a, b):
return any([a["x1"] < b["x0"],
a["x0"] > b["x1"],
a["bottom"] < b["top"],
a["top"] > b["bottom"]])
i = 0
while i + 1 < len(layouts):
j = i + 1
while j < min(i + far, len(layouts)) \
and (layouts[i].get("type", "") != layouts[j].get("type", "")
or notOverlapped(layouts[i], layouts[j])):
j += 1
if j >= min(i + far, len(layouts)):
i += 1
continue
if Recognizer.overlapped_area(layouts[i], layouts[j]) < thr \
and Recognizer.overlapped_area(layouts[j], layouts[i]) < thr:
i += 1
continue
if layouts[i].get("score") and layouts[j].get("score"):
if layouts[i]["score"] > layouts[j]["score"]:
layouts.pop(j)
else:
layouts.pop(i)
continue
area_i, area_i_1 = 0, 0
for b in boxes:
if not notOverlapped(b, layouts[i]):
area_i += Recognizer.overlapped_area(b, layouts[i], False)
if not notOverlapped(b, layouts[j]):
area_i_1 += Recognizer.overlapped_area(b, layouts[j], False)
if area_i > area_i_1:
layouts.pop(j)
else:
layouts.pop(i)
return layouts
def create_inputs(self, imgs, im_info):
"""generate inputs for different model types
Args:
imgs (list(numpy)): list of images (np.ndarray)
im_info (list(dict)): list of image info
Returns:
inputs (dict): input of model
"""
inputs = {}
im_shape = []
scale_factor = []
if len(imgs) == 1:
inputs['image'] = np.array((imgs[0],)).astype('float32')
inputs['im_shape'] = np.array(
(im_info[0]['im_shape'],)).astype('float32')
inputs['scale_factor'] = np.array(
(im_info[0]['scale_factor'],)).astype('float32')
return inputs
for e in im_info:
im_shape.append(np.array((e['im_shape'],)).astype('float32'))
scale_factor.append(np.array((e['scale_factor'],)).astype('float32'))
inputs['im_shape'] = np.concatenate(im_shape, axis=0)
inputs['scale_factor'] = np.concatenate(scale_factor, axis=0)
imgs_shape = [[e.shape[1], e.shape[2]] for e in imgs]
max_shape_h = max([e[0] for e in imgs_shape])
max_shape_w = max([e[1] for e in imgs_shape])
padding_imgs = []
for img in imgs:
im_c, im_h, im_w = img.shape[:]
padding_im = np.zeros(
(im_c, max_shape_h, max_shape_w), dtype=np.float32)
padding_im[:, :im_h, :im_w] = img
padding_imgs.append(padding_im)
inputs['image'] = np.stack(padding_imgs, axis=0)
return inputs
@staticmethod
def find_overlapped(box, boxes_sorted_by_y, naive=False):
if not boxes_sorted_by_y:
return
bxs = boxes_sorted_by_y
s, e, ii = 0, len(bxs), 0
while s < e and not naive:
ii = (e + s) // 2
pv = bxs[ii]
if box["bottom"] < pv["top"]:
e = ii
continue
if box["top"] > pv["bottom"]:
s = ii + 1
continue
break
while s < ii:
if box["top"] > bxs[s]["bottom"]:
s += 1
break
while e - 1 > ii:
if box["bottom"] < bxs[e - 1]["top"]:
e -= 1
break
max_overlaped_i, max_overlaped = None, 0
for i in range(s, e):
ov = Recognizer.overlapped_area(bxs[i], box)
if ov <= max_overlaped:
continue
max_overlaped_i = i
max_overlaped = ov
return max_overlaped_i
@staticmethod
def find_horizontally_tightest_fit(box, boxes):
if not boxes:
return
min_dis, min_i = 1000000, None
for i,b in enumerate(boxes):
if box.get("layoutno", "0") != b.get("layoutno", "0"):
continue
dis = min(abs(box["x0"] - b["x0"]), abs(box["x1"] - b["x1"]), abs(box["x0"]+box["x1"] - b["x1"] - b["x0"])/2)
if dis < min_dis:
min_i = i
min_dis = dis
return min_i
@staticmethod
def find_overlapped_with_threashold(box, boxes, thr=0.3):
if not boxes:
return
max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0
s, e = 0, len(boxes)
for i in range(s, e):
ov = Recognizer.overlapped_area(box, boxes[i])
_ov = Recognizer.overlapped_area(boxes[i], box)
if (ov, _ov) < (max_overlapped, _max_overlapped):
continue
max_overlapped_i = i
max_overlapped = ov
_max_overlapped = _ov
return max_overlapped_i
def preprocess(self, image_list):
inputs = []
if "scale_factor" in self.input_names:
preprocess_ops = []
for op_info in [
{'interp': 2, 'keep_ratio': False, 'target_size': [800, 608], 'type': 'LinearResize'},
{'is_scale': True, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225], 'type': 'StandardizeImage'},
{'type': 'Permute'},
{'stride': 32, 'type': 'PadStride'}
]:
new_op_info = op_info.copy()
op_type = new_op_info.pop('type')
preprocess_ops.append(getattr(operators, op_type)(**new_op_info))
for im_path in image_list:
im, im_info = preprocess(im_path, preprocess_ops)
inputs.append({"image": np.array((im,)).astype('float32'),
"scale_factor": np.array((im_info["scale_factor"],)).astype('float32')})
else:
hh, ww = self.input_shape
for img in image_list:
h, w = img.shape[:2]
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = cv2.resize(np.array(img).astype('float32'), (ww, hh))
# Scale input pixel values to 0 to 1
img /= 255.0
img = img.transpose(2, 0, 1)
img = img[np.newaxis, :, :, :].astype(np.float32)
inputs.append({self.input_names[0]: img, "scale_factor": [w/ww, h/hh]})
return inputs
def postprocess(self, boxes, inputs, thr):
if "scale_factor" in self.input_names:
bb = []
for b in boxes:
clsid, bbox, score = int(b[0]), b[2:], b[1]
if score < thr:
continue
if clsid >= len(self.label_list):
continue
bb.append({
"type": self.label_list[clsid].lower(),
"bbox": [float(t) for t in bbox.tolist()],
"score": float(score)
})
return bb
def xywh2xyxy(x):
# [x, y, w, h] to [x1, y1, x2, y2]
y = np.copy(x)
y[:, 0] = x[:, 0] - x[:, 2] / 2
y[:, 1] = x[:, 1] - x[:, 3] / 2
y[:, 2] = x[:, 0] + x[:, 2] / 2
y[:, 3] = x[:, 1] + x[:, 3] / 2
return y
def compute_iou(box, boxes):
# Compute xmin, ymin, xmax, ymax for both boxes
xmin = np.maximum(box[0], boxes[:, 0])
ymin = np.maximum(box[1], boxes[:, 1])
xmax = np.minimum(box[2], boxes[:, 2])
ymax = np.minimum(box[3], boxes[:, 3])
# Compute intersection area
intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
# Compute union area
box_area = (box[2] - box[0]) * (box[3] - box[1])
boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
union_area = box_area + boxes_area - intersection_area
# Compute IoU
iou = intersection_area / union_area
return iou
def iou_filter(boxes, scores, iou_threshold):
sorted_indices = np.argsort(scores)[::-1]
keep_boxes = []
while sorted_indices.size > 0:
# Pick the highest-scoring remaining box
box_id = sorted_indices[0]
keep_boxes.append(box_id)
# Compute IoU of the picked box with the rest
ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :])
# Remove boxes with IoU over the threshold
keep_indices = np.where(ious < iou_threshold)[0]
# print(keep_indices.shape, sorted_indices.shape)
sorted_indices = sorted_indices[keep_indices + 1]
return keep_boxes
boxes = np.squeeze(boxes).T
# Filter out object confidence scores below threshold
scores = np.max(boxes[:, 4:], axis=1)
boxes = boxes[scores > thr, :]
scores = scores[scores > thr]
if len(boxes) == 0:
return []
# Get the class with the highest confidence
class_ids = np.argmax(boxes[:, 4:], axis=1)
boxes = boxes[:, :4]
input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0], inputs["scale_factor"][1]])
boxes = np.multiply(boxes, input_shape, dtype=np.float32)
boxes = xywh2xyxy(boxes)
unique_class_ids = np.unique(class_ids)
indices = []
for class_id in unique_class_ids:
class_indices = np.where(class_ids == class_id)[0]
class_boxes = boxes[class_indices, :]
class_scores = scores[class_indices]
class_keep_boxes = iou_filter(class_boxes, class_scores, 0.2)
indices.extend(class_indices[class_keep_boxes])
return [{
"type": self.label_list[class_ids[i]].lower(),
"bbox": [float(t) for t in boxes[i].tolist()],
"score": float(scores[i])
} for i in indices]
def __call__(self, image_list, thr=0.7, batch_size=16):
res = []
imgs = []
for i in range(len(image_list)):
if not isinstance(image_list[i], np.ndarray):
imgs.append(np.array(image_list[i]))
else:
imgs.append(image_list[i])
batch_loop_cnt = math.ceil(float(len(imgs)) / batch_size)
for i in range(batch_loop_cnt):
start_index = i * batch_size
end_index = min((i + 1) * batch_size, len(imgs))
batch_image_list = imgs[start_index:end_index]
inputs = self.preprocess(batch_image_list)
logging.debug("preprocess")
for ins in inputs:
bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names}, self.run_options)[0], ins, thr)
res.append(bb)
#seeit.save_results(image_list, res, self.label_list, threshold=thr)
return res
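Aside: a minimal, self-contained NumPy sketch of the center-format conversion and IoU arithmetic used by xywh2xyxy() and compute_iou() inside postprocess() above; the coordinates are illustrative assumptions.
import numpy as np
xywh = np.array([[50., 50., 20., 10.]])    # [cx, cy, w, h]
xyxy = np.copy(xywh)
xyxy[:, 0] = xywh[:, 0] - xywh[:, 2] / 2   # x1 = 40
xyxy[:, 1] = xywh[:, 1] - xywh[:, 3] / 2   # y1 = 45
xyxy[:, 2] = xywh[:, 0] + xywh[:, 2] / 2   # x2 = 60
xyxy[:, 3] = xywh[:, 1] + xywh[:, 3] / 2   # y2 = 55
box = xyxy[0]
other = np.array([[50., 45., 70., 55.]])   # same size, shifted right by 10 pixels
inter = np.maximum(0, np.minimum(box[2], other[:, 2]) - np.maximum(box[0], other[:, 0])) * \
    np.maximum(0, np.minimum(box[3], other[:, 3]) - np.maximum(box[1], other[:, 1]))
union = (box[2] - box[0]) * (box[3] - box[1]) + \
    (other[:, 2] - other[:, 0]) * (other[:, 3] - other[:, 1]) - inter
print(inter / union)                       # [0.333...]: IoU of 100 / 300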

View File

@ -1,87 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import PIL
from PIL import ImageDraw
def save_results(image_list, results, labels, output_dir='output/', threshold=0.5):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for idx, im in enumerate(image_list):
im = draw_box(im, results[idx], labels, threshold=threshold)
out_path = os.path.join(output_dir, f"{idx}.jpg")
im.save(out_path, quality=95)
logging.debug("save result to: " + out_path)
def draw_box(im, result, lables, threshold=0.5):
draw_thickness = min(im.size) // 320
draw = ImageDraw.Draw(im)
color_list = get_color_map_list(len(lables))
clsid2color = {n.lower():color_list[i] for i,n in enumerate(lables)}
result = [r for r in result if r["score"] >= threshold]
for dt in result:
color = tuple(clsid2color[dt["type"]])
xmin, ymin, xmax, ymax = dt["bbox"]
draw.line(
[(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
(xmin, ymin)],
width=draw_thickness,
fill=color)
# draw label
text = "{} {:.4f}".format(dt["type"], dt["score"])
tw, th = imagedraw_textsize_c(draw, text)
draw.rectangle(
[(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color)
draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
return im
def get_color_map_list(num_classes):
"""
Args:
num_classes (int): number of classes
Returns:
color_map (list): RGB color list
"""
color_map = num_classes * [0, 0, 0]
for i in range(0, num_classes):
j = 0
lab = i
while lab:
color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j))
color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j))
color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j))
j += 1
lab >>= 3
color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)]
return color_map
def imagedraw_textsize_c(draw, text):
if int(PIL.__version__.split('.')[0]) < 10:
tw, th = draw.textsize(text)
else:
left, top, right, bottom = draw.textbbox((0, 0), text)
tw, th = right - left, bottom - top
return tw, th
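Aside: a minimal sketch of the Pillow version shim behind imagedraw_textsize_c() above; ImageDraw.textsize() was removed in Pillow 10, so textbbox() is the portable path (assumes Pillow is installed).
import PIL
from PIL import Image, ImageDraw
im = Image.new("RGB", (200, 50), "white")
draw = ImageDraw.Draw(im)
text = "table 0.9876"
if int(PIL.__version__.split('.')[0]) < 10:
    tw, th = draw.textsize(text)                       # Pillow < 10
else:
    left, top, right, bottom = draw.textbbox((0, 0), text)
    tw, th = right - left, bottom - top                # Pillow >= 10
print(tw, th)                                          # text width and height in pixels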

View File

@ -1,59 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import sys
sys.path.insert(
0,
os.path.abspath(
os.path.join(
os.path.dirname(
os.path.abspath(__file__)),
'../../')))
from deepdoc.vision.seeit import draw_box
from deepdoc.vision import OCR, init_in_out
import argparse
import numpy as np
def main(args):
ocr = OCR()
images, outputs = init_in_out(args)
for i, img in enumerate(images):
bxs = ocr(np.array(img))
bxs = [(line[0], line[1][0]) for line in bxs]
bxs = [{
"text": t,
"bbox": [b[0][0], b[0][1], b[1][0], b[-1][1]],
"type": "ocr",
"score": 1} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]]
img = draw_box(images[i], bxs, ["ocr"], 1.)
img.save(outputs[i], quality=95)
with open(outputs[i] + ".txt", "w+", encoding='utf-8') as f:
f.write("\n".join([o["text"] for o in bxs]))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--inputs',
help="Directory containing input images or PDFs, or the path to a single image or PDF",
required=True)
parser.add_argument('--output_dir', help="Directory where to store the output images. Default: './ocr_outputs'",
default="./ocr_outputs")
args = parser.parse_args()
main(args)

View File

@ -1,186 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import sys
sys.path.insert(
0,
os.path.abspath(
os.path.join(
os.path.dirname(
os.path.abspath(__file__)),
'../../')))
from deepdoc.vision.seeit import draw_box
from deepdoc.vision import LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
import argparse
import re
import numpy as np
def main(args):
images, outputs = init_in_out(args)
if args.mode.lower() == "layout":
detr = LayoutRecognizer("layout")
layouts = detr.forward(images, thr=float(args.threshold))
if args.mode.lower() == "tsr":
detr = TableStructureRecognizer()
ocr = OCR()
layouts = detr(images, thr=float(args.threshold))
for i, lyt in enumerate(layouts):
if args.mode.lower() == "tsr":
#lyt = [t for t in lyt if t["type"] == "table column"]
html = get_table_html(images[i], lyt, ocr)
with open(outputs[i] + ".html", "w+", encoding='utf-8') as f:
f.write(html)
lyt = [{
"type": t["label"],
"bbox": [t["x0"], t["top"], t["x1"], t["bottom"]],
"score": t["score"]
} for t in lyt]
img = draw_box(images[i], lyt, detr.labels, float(args.threshold))
img.save(outputs[i], quality=95)
logging.info("save result to: " + outputs[i])
def get_table_html(img, tb_cpns, ocr):
boxes = ocr(np.array(img))
boxes = LayoutRecognizer.sort_Y_firstly(
[{"x0": b[0][0], "x1": b[1][0],
"top": b[0][1], "text": t[0],
"bottom": b[-1][1],
"layout_type": "table",
"page_number": 0} for b, t in boxes if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
np.mean([b[-1][1] - b[0][1] for b, _ in boxes]) / 3
)
def gather(kwd, fzy=10, ption=0.6):
nonlocal boxes
eles = LayoutRecognizer.sort_Y_firstly(
[r for r in tb_cpns if re.match(kwd, r["label"])], fzy)
eles = LayoutRecognizer.layouts_cleanup(boxes, eles, 5, ption)
return LayoutRecognizer.sort_Y_firstly(eles, 0)
headers = gather(r".*header$")
rows = gather(r".* (row|header)")
spans = gather(r".*spanning")
clmns = sorted([r for r in tb_cpns if re.match(
r"table column$", r["label"])], key=lambda x: x["x0"])
clmns = LayoutRecognizer.layouts_cleanup(boxes, clmns, 5, 0.5)
for b in boxes:
ii = LayoutRecognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
if ii is not None:
b["R"] = ii
b["R_top"] = rows[ii]["top"]
b["R_bott"] = rows[ii]["bottom"]
ii = LayoutRecognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
if ii is not None:
b["H_top"] = headers[ii]["top"]
b["H_bott"] = headers[ii]["bottom"]
b["H_left"] = headers[ii]["x0"]
b["H_right"] = headers[ii]["x1"]
b["H"] = ii
ii = LayoutRecognizer.find_horizontally_tightest_fit(b, clmns)
if ii is not None:
b["C"] = ii
b["C_left"] = clmns[ii]["x0"]
b["C_right"] = clmns[ii]["x1"]
ii = LayoutRecognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
if ii is not None:
b["H_top"] = spans[ii]["top"]
b["H_bott"] = spans[ii]["bottom"]
b["H_left"] = spans[ii]["x0"]
b["H_right"] = spans[ii]["x1"]
b["SP"] = ii
html = """
<html>
<head>
<style>
._table_1nkzy_11 {
margin: auto;
width: 70%%;
padding: 10px;
}
._table_1nkzy_11 p {
margin-bottom: 50px;
border: 1px solid #e1e1e1;
}
caption {
color: #6ac1ca;
font-size: 20px;
height: 50px;
line-height: 50px;
font-weight: 600;
margin-bottom: 10px;
}
._table_1nkzy_11 table {
width: 100%%;
border-collapse: collapse;
}
th {
color: #fff;
background-color: #6ac1ca;
}
td:hover {
background: #c1e8e8;
}
tr:nth-child(even) {
background-color: #f2f2f2;
}
._table_1nkzy_11 th,
._table_1nkzy_11 td {
text-align: center;
border: 1px solid #ddd;
padding: 8px;
}
</style>
</head>
<body>
%s
</body>
</html>
""" % TableStructureRecognizer.construct_table(boxes, html=True)
return html
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--inputs',
help="Directory containing input images or PDFs, or the path to a single image or PDF",
required=True)
parser.add_argument('--output_dir', help="Directory where to store the output images. Default: './layouts_outputs'",
default="./layouts_outputs")
parser.add_argument(
'--threshold',
help="A threshold to filter out detections. Default: 0.5",
default=0.5)
parser.add_argument('--mode', help="Task mode: layout recognition or table structure recognition", choices=["layout", "tsr"],
default="layout")
args = parser.parse_args()
main(args)

View File

@ -1,587 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import re
from collections import Counter
import numpy as np
from huggingface_hub import snapshot_download
from api.utils.file_utils import get_project_base_directory
from rag.nlp import rag_tokenizer
from .recognizer import Recognizer
class TableStructureRecognizer(Recognizer):
labels = [
"table",
"table column",
"table row",
"table column header",
"table projected row header",
"table spanning cell",
]
def __init__(self):
try:
super().__init__(self.labels, "tsr", os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"))
except Exception:
super().__init__(self.labels, "tsr", snapshot_download(repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
local_dir_use_symlinks=False))
def __call__(self, images, thr=0.2):
tbls = super().__call__(images, thr)
res = []
# align left&right for rows, align top&bottom for columns
for tbl in tbls:
lts = [{"label": b["type"],
"score": b["score"],
"x0": b["bbox"][0], "x1": b["bbox"][2],
"top": b["bbox"][1], "bottom": b["bbox"][-1]
} for b in tbl]
if not lts:
continue
left = [b["x0"] for b in lts if b["label"].find(
"row") > 0 or b["label"].find("header") > 0]
right = [b["x1"] for b in lts if b["label"].find(
"row") > 0 or b["label"].find("header") > 0]
if not left:
continue
left = np.mean(left) if len(left) > 4 else np.min(left)
right = np.mean(right) if len(right) > 4 else np.max(right)
for b in lts:
if b["label"].find("row") > 0 or b["label"].find("header") > 0:
if b["x0"] > left:
b["x0"] = left
if b["x1"] < right:
b["x1"] = right
top = [b["top"] for b in lts if b["label"] == "table column"]
bottom = [b["bottom"] for b in lts if b["label"] == "table column"]
if not top:
res.append(lts)
continue
top = np.median(top) if len(top) > 4 else np.min(top)
bottom = np.median(bottom) if len(bottom) > 4 else np.max(bottom)
for b in lts:
if b["label"] == "table column":
if b["top"] > top:
b["top"] = top
if b["bottom"] < bottom:
b["bottom"] = bottom
res.append(lts)
return res
@staticmethod
def is_caption(bx):
patt = [
r"[图表]+[ 0-9:]{2,}"
]
if any([re.match(p, bx["text"].strip()) for p in patt]) \
or bx["layout_type"].find("caption") >= 0:
return True
return False
@staticmethod
def blockType(b):
patt = [
("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
(r"^(20|19)[0-9]{2}年$", "Dt"),
(r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
(r"^第*[一二三四1-4]季度$", "Dt"),
(r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
(r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
("^[0-9.,+%/ -]+$", "Nu"),
(r"^[0-9A-Z/\._~-]+$", "Ca"),
(r"^[A-Z]*[a-z' -]+$", "En"),
(r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()' -]+$", "NE"),
(r"^.{1}$", "Sg")
]
for p, n in patt:
if re.search(p, b["text"].strip()):
return n
tks = [t for t in rag_tokenizer.tokenize(b["text"]).split() if len(t) > 1]
if len(tks) > 3:
if len(tks) < 12:
return "Tx"
else:
return "Lx"
if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
return "Nr"
return "Ot"
@staticmethod
def construct_table(boxes, is_english=False, html=False):
cap = ""
i = 0
while i < len(boxes):
if TableStructureRecognizer.is_caption(boxes[i]):
if is_english:
cap += " "
cap += boxes[i]["text"]
boxes.pop(i)
i -= 1
i += 1
if not boxes:
return []
for b in boxes:
b["btype"] = TableStructureRecognizer.blockType(b)
max_type = Counter([b["btype"] for b in boxes]).items()
max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
logging.debug("MAXTYPE: " + max_type)
rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
rowh = np.min(rowh) if rowh else 0
boxes = Recognizer.sort_R_firstly(boxes, rowh / 2)
#for b in boxes:print(b)
boxes[0]["rn"] = 0
rows = [[boxes[0]]]
btm = boxes[0]["bottom"]
for b in boxes[1:]:
b["rn"] = len(rows) - 1
lst_r = rows[-1]
if lst_r[-1].get("R", "") != b.get("R", "") \
or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
): # new row
btm = b["bottom"]
b["rn"] += 1
rows.append([b])
continue
btm = (btm + b["bottom"]) / 2.
rows[-1].append(b)
colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
colwm = np.min(colwm) if colwm else 0
crosspage = len(set([b["page_number"] for b in boxes])) > 1
if crosspage:
boxes = Recognizer.sort_X_firstly(boxes, colwm / 2)
else:
boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
boxes[0]["cn"] = 0
cols = [[boxes[0]]]
right = boxes[0]["x1"]
for b in boxes[1:]:
b["cn"] = len(cols) - 1
lst_c = cols[-1]
if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1][
"page_number"]) \
or (b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")): # new col
right = b["x1"]
b["cn"] += 1
cols.append([b])
continue
right = (right + b["x1"]) / 2.
cols[-1].append(b)
tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
for b in boxes:
tbl[b["rn"]][b["cn"]].append(b)
if len(rows) >= 4:
# remove columns that have only a single occupied cell
j = 0
while j < len(tbl[0]):
e, ii = 0, 0
for i in range(len(tbl)):
if tbl[i][j]:
e += 1
ii = i
if e > 1:
break
if e > 1:
j += 1
continue
f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
[j - 1][0].get("text")) or j == 0
ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
[j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
if f and ff:
j += 1
continue
bx = tbl[ii][j][0]
logging.debug("Relocate column single: " + bx["text"])
# column j has only one value
left, right = 100000, 100000
if j > 0 and not f:
for i in range(len(tbl)):
if tbl[i][j - 1]:
left = min(left, np.min(
[bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
if j + 1 < len(tbl[0]) and not ff:
for i in range(len(tbl)):
if tbl[i][j + 1]:
right = min(right, np.min(
[a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
assert left < 100000 or right < 100000
if left < right:
for jj in range(j, len(tbl[0])):
for i in range(len(tbl)):
for a in tbl[i][jj]:
a["cn"] -= 1
if tbl[ii][j - 1]:
tbl[ii][j - 1].extend(tbl[ii][j])
else:
tbl[ii][j - 1] = tbl[ii][j]
for i in range(len(tbl)):
tbl[i].pop(j)
else:
for jj in range(j + 1, len(tbl[0])):
for i in range(len(tbl)):
for a in tbl[i][jj]:
a["cn"] -= 1
if tbl[ii][j + 1]:
tbl[ii][j + 1].extend(tbl[ii][j])
else:
tbl[ii][j + 1] = tbl[ii][j]
for i in range(len(tbl)):
tbl[i].pop(j)
cols.pop(j)
assert len(cols) == len(tbl[0]), "Column count mismatched: %d vs %d" % (
len(cols), len(tbl[0]))
if len(cols) >= 4:
# remove rows that have only a single occupied cell
i = 0
while i < len(tbl):
e, jj = 0, 0
for j in range(len(tbl[i])):
if tbl[i][j]:
e += 1
jj = j
if e > 1:
break
if e > 1:
i += 1
continue
f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
[jj][0].get("text")) or i == 0
ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
[jj][0].get("text")) or i + 1 >= len(tbl)
if f and ff:
i += 1
continue
bx = tbl[i][jj][0]
logging.debug("Relocate row single: " + bx["text"])
# row i has only one value
up, down = 100000, 100000
if i > 0 and not f:
for j in range(len(tbl[i - 1])):
if tbl[i - 1][j]:
up = min(up, np.min(
[bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
if i + 1 < len(tbl) and not ff:
for j in range(len(tbl[i + 1])):
if tbl[i + 1][j]:
down = min(down, np.min(
[a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
assert up < 100000 or down < 100000
if up < down:
for ii in range(i, len(tbl)):
for j in range(len(tbl[ii])):
for a in tbl[ii][j]:
a["rn"] -= 1
if tbl[i - 1][jj]:
tbl[i - 1][jj].extend(tbl[i][jj])
else:
tbl[i - 1][jj] = tbl[i][jj]
tbl.pop(i)
else:
for ii in range(i + 1, len(tbl)):
for j in range(len(tbl[ii])):
for a in tbl[ii][j]:
a["rn"] -= 1
if tbl[i + 1][jj]:
tbl[i + 1][jj].extend(tbl[i][jj])
else:
tbl[i + 1][jj] = tbl[i][jj]
tbl.pop(i)
rows.pop(i)
# which rows are headers
hdset = set([])
for i in range(len(tbl)):
cnt, h = 0, 0
for j, arr in enumerate(tbl[i]):
if not arr:
continue
cnt += 1
if max_type == "Nu" and arr[0]["btype"] == "Nu":
continue
if any([a.get("H") for a in arr]) \
or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
h += 1
if h / cnt > 0.5:
hdset.add(i)
if html:
return TableStructureRecognizer.__html_table(cap, hdset,
TableStructureRecognizer.__cal_spans(boxes, rows,
cols, tbl, True)
)
return TableStructureRecognizer.__desc_table(cap, hdset,
TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl,
False),
is_english)
@staticmethod
def __html_table(cap, hdset, tbl):
# construct HTML
html = "<table>"
if cap:
html += f"<caption>{cap}</caption>"
for i in range(len(tbl)):
row = "<tr>"
txts = []
for j, arr in enumerate(tbl[i]):
if arr is None:
continue
if not arr:
row += "<td></td>" if i not in hdset else "<th></th>"
continue
txt = ""
if arr:
h = min(np.min([c["bottom"] - c["top"]
for c in arr]) / 2, 10)
txt = " ".join([c["text"]
for c in Recognizer.sort_Y_firstly(arr, h)])
txts.append(txt)
sp = ""
if arr[0].get("colspan"):
sp = "colspan={}".format(arr[0]["colspan"])
if arr[0].get("rowspan"):
sp += " rowspan={}".format(arr[0]["rowspan"])
if i in hdset:
row += f"<th {sp} >" + txt + "</th>"
else:
row += f"<td {sp} >" + txt + "</td>"
if i in hdset:
if all([t in hdset for t in txts]):
continue
for t in txts:
hdset.add(t)
if row != "<tr>":
row += "</tr>"
else:
row = ""
html += "\n" + row
html += "\n</table>"
return html
@staticmethod
def __desc_table(cap, hdr_rowno, tbl, is_english):
# get the text of every column in the header rows to become the header text
clmno = len(tbl[0])
rowno = len(tbl)
headers = {}
hdrset = set()
lst_hdr = []
de = "" if not is_english else " for "
for r in sorted(list(hdr_rowno)):
headers[r] = ["" for _ in range(clmno)]
for i in range(clmno):
if not tbl[r][i]:
continue
txt = " ".join([a["text"].strip() for a in tbl[r][i]])
headers[r][i] = txt
hdrset.add(txt)
if all([not t for t in headers[r]]):
del headers[r]
hdr_rowno.remove(r)
continue
for j in range(clmno):
if headers[r][j]:
continue
if j >= len(lst_hdr):
break
headers[r][j] = lst_hdr[j]
lst_hdr = headers[r]
for i in range(rowno):
if i not in hdr_rowno:
continue
for j in range(i + 1, rowno):
if j not in hdr_rowno:
break
for k in range(clmno):
if not headers[j - 1][k]:
continue
if headers[j][k].find(headers[j - 1][k]) >= 0:
continue
if len(headers[j][k]) > len(headers[j - 1][k]):
headers[j][k] += (de if headers[j][k]
else "") + headers[j - 1][k]
else:
headers[j][k] = headers[j - 1][k] \
+ (de if headers[j - 1][k] else "") \
+ headers[j][k]
logging.debug(
f">>>>>>>>>>>>>>>>>{cap}SIZE:{rowno}X{clmno} Header: {hdr_rowno}")
row_txt = []
for i in range(rowno):
if i in hdr_rowno:
continue
rtxt = []
def append(delimer):
nonlocal rtxt, row_txt
rtxt = delimer.join(rtxt)
if row_txt and len(row_txt[-1]) + len(rtxt) < 64:
row_txt[-1] += "\n" + rtxt
else:
row_txt.append(rtxt)
r = 0
if len(headers.items()):
_arr = [(i - r, r) for r, _ in headers.items() if r < i]
if _arr:
_, r = min(_arr, key=lambda x: x[0])
if r not in headers and clmno <= 2:
for j in range(clmno):
if not tbl[i][j]:
continue
txt = "".join([a["text"].strip() for a in tbl[i][j]])
if txt:
rtxt.append(txt)
if rtxt:
append("")
continue
for j in range(clmno):
if not tbl[i][j]:
continue
txt = "".join([a["text"].strip() for a in tbl[i][j]])
if not txt:
continue
ctt = headers[r][j] if r in headers else ""
if ctt:
ctt += ""
ctt += txt
if ctt:
rtxt.append(ctt)
if rtxt:
row_txt.append("; ".join(rtxt))
if cap:
if is_english:
from_ = " in "
else:
from_ = "来自"
row_txt = [t + f"\t——{from_}{cap}" for t in row_txt]
return row_txt
@staticmethod
def __cal_spans(boxes, rows, cols, tbl, html=True):
# calculate spans
clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
for cln in cols]
crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln])
for cln in cols]
rtop = [np.mean([c.get("R_top", c["top"]) for c in row])
for row in rows]
rbtm = [np.mean([c.get("R_btm", c["bottom"])
for c in row]) for row in rows]
for b in boxes:
if "SP" not in b:
continue
b["colspan"] = [b["cn"]]
b["rowspan"] = [b["rn"]]
# col span
for j in range(0, len(clft)):
if j == b["cn"]:
continue
if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]:
continue
if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]:
continue
b["colspan"].append(j)
# row span
for j in range(0, len(rtop)):
if j == b["rn"]:
continue
if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]:
continue
if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]:
continue
b["rowspan"].append(j)
def join(arr):
if not arr:
return ""
return "".join([t["text"] for t in arr])
# remove the spanning cells
for i in range(len(tbl)):
for j, arr in enumerate(tbl[i]):
if not arr:
continue
if all(["rowspan" not in a and "colspan" not in a for a in arr]):
continue
rowspan, colspan = [], []
for a in arr:
if isinstance(a.get("rowspan", 0), list):
rowspan.extend(a["rowspan"])
if isinstance(a.get("colspan", 0), list):
colspan.extend(a["colspan"])
rowspan, colspan = set(rowspan), set(colspan)
if len(rowspan) < 2 and len(colspan) < 2:
for a in arr:
if "rowspan" in a:
del a["rowspan"]
if "colspan" in a:
del a["colspan"]
continue
rowspan, colspan = sorted(rowspan), sorted(colspan)
rowspan = list(range(rowspan[0], rowspan[-1] + 1))
colspan = list(range(colspan[0], colspan[-1] + 1))
assert i in rowspan, rowspan
assert j in colspan, colspan
arr = []
for r in rowspan:
for c in colspan:
arr_txt = join(arr)
if tbl[r][c] and join(tbl[r][c]) != arr_txt:
arr.extend(tbl[r][c])
tbl[r][c] = None if html else arr
for a in arr:
if len(rowspan) > 1:
a["rowspan"] = len(rowspan)
elif "rowspan" in a:
del a["rowspan"]
if len(colspan) > 1:
a["colspan"] = len(colspan)
elif "colspan" in a:
del a["colspan"]
tbl[rowspan[0]][colspan[0]] = arr
return tbl
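Aside: a minimal, self-contained sketch of the regex bucketing idea behind blockType() above; the reduced pattern set and sample strings are illustrative assumptions.
import re
patterns = [
    (r"^(20|19)[0-9]{2}年$", "Dt"),        # date-like
    (r"^[0-9.,+%/ -]+$", "Nu"),            # numeric
    (r"^[A-Z]*[a-z' -]+$", "En"),          # English word(s)
]
def toy_block_type(text):
    for pattern, tag in patterns:
        if re.search(pattern, text.strip()):
            return tag
    return "Ot"                            # everything else
print([toy_block_type(t) for t in ["2023年", "1,234.5", "Revenue", "其他"]])
# -> ['Dt', 'Nu', 'En', 'Ot']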

View File

@ -9,18 +9,18 @@ declare module 'vue' {
export interface GlobalComponents {
SvgIcon: import("vue").DefineComponent<{
name: {
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management">;
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management">;
default: string;
required: true;
};
}, {}, unknown, {}, {}, import("vue").ComponentOptionsMixin, import("vue").ComponentOptionsMixin, {}, string, import("vue").VNodeProps & import("vue").AllowedComponentProps & import("vue").ComponentCustomProps, Readonly<import("vue").ExtractPropTypes<{
name: {
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management">;
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management">;
default: string;
required: true;
};
}>>, {
name: "conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management";
name: "conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management";
}>;
}
}

View File

@ -7,20 +7,20 @@
declare module '~virtual/svg-component' {
const SvgIcon: import("vue").DefineComponent<{
name: {
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management">;
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management">;
default: string;
required: true;
};
}, {}, unknown, {}, {}, import("vue").ComponentOptionsMixin, import("vue").ComponentOptionsMixin, {}, string, import("vue").VNodeProps & import("vue").AllowedComponentProps & import("vue").ComponentCustomProps, Readonly<import("vue").ExtractPropTypes<{
name: {
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management">;
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management">;
default: string;
required: true;
};
}>>, {
name: "conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management";
name: "conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management";
}>;
export const svgNames: ["conversation", "dashboard", "file", "fullscreen-exit", "fullscreen", "kb", "keyboard-down", "keyboard-enter", "keyboard-esc", "keyboard-up", "search", "team-management", "user-config", "user-management"];
export type SvgName = "conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management";
export const svgNames: ["conversation", "dashboard", "file", "fullscreen-exit", "fullscreen", "kb", "keyboard-down", "keyboard-enter", "keyboard-esc", "keyboard-up", "search", "storage", "team-management", "user-config", "user-management"];
export type SvgName = "conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management";
export default SvgIcon;
}

View File

@ -1,15 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View File

@ -1,44 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
from api.db import LLMType
from rag.nlp import rag_tokenizer
from api.db.services.llm_service import LLMBundle
from rag.nlp import tokenize
def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
# is it English
eng = lang.lower() == "english" # is_english(sections)
try:
callback(0.1, "Use Sequence2Txt LLM to transcribe the audio")
seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
ans = seq2txt_mdl.transcription(binary)
callback(0.8, "Sequence2Txt LLM response: %s ..." % ans[:32])
tokenize(doc, ans, eng)
return [doc]
except Exception as e:
callback(prog=-1, msg=str(e))
return []

View File

@ -1,157 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
import re
from io import BytesIO
from deepdoc.parser.utils import get_text
from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts: {}".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
self._naive_vertical_merge()
self._filter_forpages()
self._merge_with_same_bullet()
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
for b in self.boxes], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Since a book is long and not all parts are useful, if it's a PDF,
please set up the page ranges for every book in order to eliminate negative effects and save computing time.
"""
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pdf_parser = None
sections, tbls = [], []
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
doc_parser = DocxParser()
# TODO: the table of contents needs to be removed
sections, tbls = doc_parser(
binary if binary else filename, from_page=from_page, to_page=to_page)
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
tbls = [((None, lns), None) for lns in tbls]
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet (doc, docx, pdf, txt, html supported)")
make_colon_as_title(sections)
bull = bullets_category(
[t for t in random_choices([t for t, _ in sections], k=100)])
if bull >= 0:
chunks = ["\n".join(ck)
for ck in hierarchical_merge(bull, sections, 5)]
else:
sections = [s.split("@") for s, _ in sections]
sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections ]
chunks = naive_merge(
sections, kwargs.get(
"chunk_token_num", 256), kwargs.get(
"delimer", "\n。;!?"))
# is it English
# is_english(random_choices([t for t, _ in sections], k=218))
eng = lang.lower() == "english"
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)

View File

@ -1,117 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from email import policy
from email.parser import BytesParser
from rag.app.naive import chunk as naive_chunk
import re
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
from deepdoc.parser import HtmlParser, TxtParser
from timeit import default_timer as timer
import io
def chunk(
filename,
binary=None,
from_page=0,
to_page=100000,
lang="Chinese",
callback=None,
**kwargs,
):
"""
Only eml is supported
"""
eng = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
"parser_config",
{"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"},
)
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
main_res = []
attachment_res = []
if binary:
msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
else:
msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
text_txt, html_txt = [], []
# get the email header info
for header, value in msg.items():
text_txt.append(f"{header}: {value}")
# get the email main info
def _add_content(msg, content_type):
if content_type == "text/plain":
text_txt.append(
msg.get_payload(decode=True).decode(msg.get_content_charset())
)
elif content_type == "text/html":
html_txt.append(
msg.get_payload(decode=True).decode(msg.get_content_charset())
)
elif "multipart" in content_type:
if msg.is_multipart():
for part in msg.iter_parts():
_add_content(part, part.get_content_type())
_add_content(msg, msg.get_content_type())
sections = TxtParser.parser_txt("\n".join(text_txt)) + [
(line, "") for line in HtmlParser.parser_txt("\n".join(html_txt)) if line
]
st = timer()
chunks = naive_merge(
sections,
int(parser_config.get("chunk_token_num", 128)),
parser_config.get("delimiter", "\n!?。;!?"),
)
main_res.extend(tokenize_chunks(chunks, doc, eng, None))
logging.debug("naive_merge({}): {}".format(filename, timer() - st))
# get the attachment info
for part in msg.iter_attachments():
content_disposition = part.get("Content-Disposition")
if content_disposition:
dispositions = content_disposition.strip().split(";")
if dispositions[0].lower() == "attachment":
filename = part.get_filename()
payload = part.get_payload(decode=True)
try:
attachment_res.extend(
naive_chunk(filename, payload, callback=callback, **kwargs)
)
except Exception:
pass
return main_res + attachment_res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)
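For readers unfamiliar with the email traversal above, a hedged, standard-library-only sketch of the same walk (headers, preferred body part, attachment names) follows. The helper and the "sample.eml" path are placeholders, not part of this repository.
from email import policy
from email.parser import BytesParser

def describe_eml(path):
    # Parse the message, then collect header lines, the preferred body part and attachment names.
    with open(path, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)
    headers = [f"{k}: {v}" for k, v in msg.items()]
    body = msg.get_body(preferencelist=("plain", "html"))
    attachments = [p.get_filename() for p in msg.iter_attachments()]
    return headers, (body.get_content() if body else ""), attachments

if __name__ == "__main__":
    hdrs, text, atts = describe_eml("sample.eml")  # placeholder path
    print(len(hdrs), "headers,", len(atts), "attachments")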

View File

@ -1,216 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
import re
from io import BytesIO
from docx import Document
from api.db import ParserType
from deepdoc.parser.utils import get_text
from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge, \
make_colon_as_title, tokenize_chunks, docx_question_level
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
class Docx(DocxParser):
def __init__(self):
pass
def __clean(self, line):
line = re.sub(r"\u3000", " ", line).strip()
return line
def old_call(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
for p in self.doc.paragraphs:
if pn > to_page:
break
if from_page <= pn < to_page and p.text.strip():
lines.append(self.__clean(p.text))
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
return [line for line in lines if line]
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
bull = bullets_category([p.text for p in self.doc.paragraphs])
for p in self.doc.paragraphs:
if pn > to_page:
break
question_level, p_text = docx_question_level(p, bull)
if not p_text.strip("\n"):
continue
lines.append((question_level, p_text))
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
visit = [False for _ in range(len(lines))]
sections = []
for s in range(len(lines)):
e = s + 1
while e < len(lines):
if lines[e][0] <= lines[s][0]:
break
e += 1
if e - s == 1 and visit[s]:
continue
sec = []
next_level = lines[s][0] + 1
while not sec and next_level < 22:
for i in range(s+1, e):
if lines[i][0] != next_level:
continue
sec.append(lines[i][1])
visit[i] = True
next_level += 1
sec.insert(0, lines[s][1])
sections.append("\n".join(sec))
return [s for s in sections if s]
def __str__(self) -> str:
return f'''
question:{self.question},
answer:{self.answer},
level:{self.level},
childs:{self.childs}
'''
class Pdf(PdfParser):
def __init__(self):
self.model_speciess = ParserType.LAWS.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts:".format(
))
self._naive_vertical_merge()
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], None
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt, html and doc.
"""
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pdf_parser = None
sections = []
# is it English
eng = lang.lower() == "english" # is_english(sections)
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
chunks = Docx()(filename, binary)
callback(0.7, "Finish parsing.")
return tokenize_chunks(chunks, doc, eng, None)
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
for txt, poss in pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)[0]:
sections.append(txt + poss)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
# Remove 'Contents' part
remove_contents_table(sections, eng)
make_colon_as_title(sections)
bull = bullets_category(sections)
chunks = hierarchical_merge(bull, sections, 5)
if not chunks:
callback(0.99, "No chunk parsed out.")
return tokenize_chunks(["\n".join(ck)
for ck in chunks], doc, eng, pdf_parser)
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)
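To make the heading/level grouping in Docx.__call__ above easier to follow, here is a simplified stand-alone sketch of the same idea: each heading pulls in the lines one level below it. The sample data is invented, and the real implementation also handles deeper nesting and visited-line bookkeeping.
def group_by_level(lines):
    # lines: list of (level, text); a lower level means a higher-ranked heading.
    sections = []
    for s, (level, text) in enumerate(lines):
        children = []
        for lvl, txt in lines[s + 1:]:
            if lvl <= level:
                break
            if lvl == level + 1:
                children.append(txt)
        sections.append("\n".join([text] + children))
    return sections

if __name__ == "__main__":
    sample = [(1, "Article 1"), (2, "Clause 1.1"), (2, "Clause 1.2"), (1, "Article 2")]
    print(group_by_level(sample))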

View File

@ -1,282 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import copy
import re
from api.db import ParserType
from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
from rag.utils import num_tokens_from_string
from deepdoc.parser import PdfParser, PlainParser, DocxParser
from docx import Document
from PIL import Image
class Pdf(PdfParser):
def __init__(self):
self.model_speciess = ParserType.MANUAL.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
# for bb in self.boxes:
# for b in bb:
# print(b)
logging.debug("OCR: {}".format(timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.65, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts: {}".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.67, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
self._concat_downward()
self._filter_forpages()
callback(0.68, "Text merged ({:.2f}s)".format(timer() - start))
# clean mess
for b in self.boxes:
b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)], tbls
class Docx(DocxParser):
def __init__(self):
pass
def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic')
if not img:
return None
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
related_part = document.part.related_parts[embed]
image = related_part.image
image = Image.open(BytesIO(image.blob))
return image
def concat_img(self, img1, img2):
if img1 and not img2:
return img1
if not img1 and img2:
return img2
if not img1 and not img2:
return None
width1, height1 = img1.size
width2, height2 = img2.size
new_width = max(width1, width2)
new_height = height1 + height2
new_image = Image.new('RGB', (new_width, new_height))
new_image.paste(img1, (0, 0))
new_image.paste(img2, (0, height1))
return new_image
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
last_answer, last_image = "", None
question_stack, level_stack = [], []
ti_list = []
for p in self.doc.paragraphs:
if pn > to_page:
break
question_level, p_text = 0, ''
if from_page <= pn < to_page and p.text.strip():
question_level, p_text = docx_question_level(p)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{p_text}'
current_image = self.get_picture(self.doc, p)
last_image = self.concat_img(last_image, current_image)
else: # is a question
if last_answer or last_image:
sum_question = '\n'.join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
last_answer, last_image = '', None
i = question_level
while question_stack and i <= level_stack[-1]:
question_stack.pop()
level_stack.pop()
question_stack.append(p_text)
level_stack.append(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
if last_answer:
sum_question = '\n'.join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
tbls = []
for tb in self.doc.tables:
html= "<table>"
for r in tb.rows:
html += "<tr>"
i = 0
while i < len(r.cells):
span = 1
c = r.cells[i]
for j in range(i+1, len(r.cells)):
if c.text == r.cells[j].text:
span += 1
i = j
i += 1
html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
html += "</tr>"
html += "</table>"
tbls.append(((None, html), ""))
return ti_list, tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Only pdf and docx are supported.
"""
pdf_parser = None
doc = {
"docnm_kwd": filename
}
doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
if re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
if sections and len(sections[0]) < 3:
sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
# set pivot using the most frequent type of title,
# then merge between 2 pivot
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
most_level = max(0, max_lvl - 1)
levels = []
for txt, _, _ in sections:
for t, lvl in pdf_parser.outlines:
tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
tks_ = set([txt[i] + txt[i + 1]
for i in range(min(len(t), len(txt) - 1))])
if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
levels.append(lvl)
break
else:
levels.append(max_lvl + 1)
else:
bull = bullets_category([txt for txt, _, _ in sections])
most_level, levels = title_frequency(
bull, [(txt, lvl) for txt, lvl, _ in sections])
assert len(sections) == len(levels)
sec_ids = []
sid = 0
for i, lvl in enumerate(levels):
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
sid += 1
sec_ids.append(sid)
# print(lvl, self.boxes[i]["text"], most_level, sid)
sections = [(txt, sec_ids[i], poss)
for i, (txt, _, poss) in enumerate(sections)]
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0], -1,
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
def tag(pn, left, right, top, bottom):
if pn + left + right + top + bottom == 0:
return ""
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(pn, left, right, top, bottom)
chunks = []
last_sid = -2
tk_cnt = 0
for txt, sec_id, poss in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1])):
poss = "\t".join([tag(*pos) for pos in poss])
if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
if chunks:
chunks[-1] += "\n" + txt + poss
tk_cnt += num_tokens_from_string(txt)
continue
chunks.append(txt + poss)
tk_cnt = num_tokens_from_string(txt)
if sec_id > -1:
last_sid = sec_id
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
res = tokenize_table(tbls, doc, eng)
for text, image in ti_list:
d = copy.deepcopy(doc)
d['image'] = image
tokenize(d, text, eng)
res.append(d)
return res
else:
raise NotImplementedError("file type not supported yet(pdf and docx supported)")
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)
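The pivot/section-id logic above is compact; a small self-contained sketch of the same rule follows (a new section id starts whenever a title at or above the pivot level appears after a different level). The sample levels and the pivot value are made up for illustration.
def assign_section_ids(levels, pivot_level):
    # A new section id starts at a title whose level is at or above the pivot.
    sec_ids, sid = [], 0
    for i, lvl in enumerate(levels):
        if lvl <= pivot_level and i > 0 and lvl != levels[i - 1]:
            sid += 1
        sec_ids.append(sid)
    return sec_ids

if __name__ == "__main__":
    print(assign_section_ids([1, 3, 3, 1, 2, 3], pivot_level=1))  # -> [0, 0, 0, 1, 1, 1]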

View File

@ -1,313 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
from io import BytesIO
from docx import Document
from timeit import default_timer as timer
import re
from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \
naive_merge_docx, tokenize_chunks_docx
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
from rag.utils import num_tokens_from_string
from PIL import Image
from functools import reduce
from markdown import markdown
from docx.image.exceptions import UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError
class Docx(DocxParser):
def __init__(self):
pass
def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic')
if not img:
return None
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
related_part = document.part.related_parts[embed]
try:
image_blob = related_part.image.blob
except UnrecognizedImageError:
logging.info("Unrecognized image format. Skipping image.")
return None
except UnexpectedEndOfFileError:
logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
return None
except InvalidImageStreamError:
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
return None
try:
image = Image.open(BytesIO(image_blob)).convert('RGB')
return image
except Exception:
return None
def __clean(self, line):
line = re.sub(r"\u3000", " ", line).strip()
return line
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
last_image = None
for p in self.doc.paragraphs:
if pn > to_page:
break
if from_page <= pn < to_page:
if p.text.strip():
if p.style and p.style.name == 'Caption':
former_image = None
if lines and lines[-1][1] and lines[-1][2] != 'Caption':
former_image = lines[-1][1].pop()
elif last_image:
former_image = last_image
last_image = None
lines.append((self.__clean(p.text), [former_image], p.style.name))
else:
current_image = self.get_picture(self.doc, p)
image_list = [current_image]
if last_image:
image_list.insert(0, last_image)
last_image = None
lines.append((self.__clean(p.text), image_list, p.style.name if p.style else ""))
else:
if current_image := self.get_picture(self.doc, p):
if lines:
lines[-1][1].append(current_image)
else:
last_image = current_image
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]
tbls = []
for tb in self.doc.tables:
html = "<table>"
for r in tb.rows:
html += "<tr>"
i = 0
while i < len(r.cells):
span = 1
c = r.cells[i]
for j in range(i + 1, len(r.cells)):
if c.text == r.cells[j].text:
span += 1
i = j
else:
break
i += 1
html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
html += "</tr>"
html += "</table>"
tbls.append(((None, html), ""))
return new_line, tbls
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
start = timer()
first_start = start
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
tbls = self._extract_table_figure(True, zoomin, True, True)
# self._naive_vertical_merge()
self._concat_downward()
# self._filter_forpages()
logging.info("layouts cost: {}s".format(timer() - first_start))
return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], tbls
class Markdown(MarkdownParser):
def __call__(self, filename, binary=None):
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(filename, "r") as f:
txt = f.read()
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n')
sections = []
tbls = []
for sec in remainder.split("\n"):
if num_tokens_from_string(sec) > 3 * self.chunk_token_num:
sections.append((sec[:int(len(sec) / 2)], ""))
sections.append((sec[int(len(sec) / 2):], ""))
else:
if sec.strip().find("#") == 0:
sections.append((sec, ""))
elif sections and sections[-1][0].strip().find("#") == 0:
sec_, _ = sections.pop(-1)
sections.append((sec_ + "\n" + sec, ""))
else:
sections.append((sec, ""))
for table in tables:
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
return sections, tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt, markdown, html, json and doc.
This method applies a naive way to chunk files:
successive text is sliced into pieces using the 'delimiter',
and these pieces are then merged into chunks whose token count is no more than 'Max token number'.
"""
is_english = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
pdf_parser = None
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tables = Docx()(filename, binary)
res = tokenize_table(tables, doc, is_english) # just for table
callback(0.8, "Finish parsing.")
st = timer()
chunks, images = naive_merge_docx(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
return chunks
res.extend(tokenize_chunks_docx(chunks, doc, is_english, images))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
callback=callback)
res = tokenize_table(tables, doc, is_english)
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
if parser_config.get("html4excel"):
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
else:
sections = [(_, "") for _ in excel_parser(binary) if _]
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = TxtParser()(filename, binary,
parser_config.get("chunk_token_num", 128),
parser_config.get("delimiter", "\n!?;。;!?"))
callback(0.8, "Finish parsing.")
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tables = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
elif re.search(r"\.json$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
chunk_token_num = int(parser_config.get("chunk_token_num", 128))
sections = JsonParser(chunk_token_num)(binary)
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
else:
callback(0.8, f"tika.parser got empty content from {filename}.")
logging.warning(f"tika.parser got empty content from {filename}.")
return []
else:
raise NotImplementedError(
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
st = timer()
chunks = naive_merge(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
return chunks
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
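As a reading aid for the Markdown branch above, here is a rough stand-alone sketch of the heading-anchored sectioning it performs. The real parser also splits over-long sections and extracts tables; this sketch does not, and its input string is invented.
def split_markdown_sections(text):
    # Glue each "#" heading to the text that immediately follows it.
    sections = []
    for line in text.split("\n"):
        if line.strip().startswith("#"):
            sections.append(line)
        elif sections and sections[-1].strip().startswith("#"):
            sections[-1] = sections[-1] + "\n" + line
        else:
            sections.append(line)
    return sections

if __name__ == "__main__":
    print(split_markdown_sections("# Title\nintro text\n\n## Section\nbody"))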

View File

@ -1,139 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
from io import BytesIO
import re
from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
self._layouts_rec(zoomin, drop=False)
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts cost: {}s".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
tbls = self._extract_table_figure(True, zoomin, True, True)
self._concat_downward()
sections = [(b["text"], self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)]
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt, markdown, html and doc.
One file forms a single chunk that maintains the original text order.
"""
eng = lang.lower() == "english" # is_english(cks)
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tbls = naive.Docx()(filename, binary)
sections = [s for s, _ in sections if s]
for (_, html), _ in tbls:
sections.append(html)
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, _ = pdf_parser(
filename if not binary else binary, to_page=to_page, callback=callback)
sections = [s for s, _ in sections if s]
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
sections = excel_parser.html(binary, 1000000000)
elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
tokenize(doc, "\n".join(sections), eng)
return [doc]
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

View File

@ -1,294 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import copy
import re
from api.db import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser
import numpy as np
class Pdf(PdfParser):
def __init__(self):
self.model_speciess = ParserType.PAPER.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug(f"layouts cost: {timer() - start}s")
start = timer()
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
self._concat_downward()
self._filter_forpages()
callback(0.75, "Text merged ({:.2f}s)".format(timer() - start))
# clean mess
if column_width < self.page_images[0].size[0] / zoomin / 2:
logging.debug("two_column................... {} {}".format(column_width,
self.page_images[0].size[0] / zoomin / 2))
self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
for b in self.boxes:
b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
def _begin(txt):
return re.match(
"[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
txt.lower().strip())
if from_page > 0:
return {
"title": "",
"authors": "",
"abstract": "",
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
re.match(r"(text|title)", b.get("layoutno", "text"))],
"tables": tbls
}
# get title and authors
title = ""
authors = []
i = 0
while i < min(32, len(self.boxes)-1):
b = self.boxes[i]
i += 1
if b.get("layoutno", "").find("title") >= 0:
title = b["text"]
if _begin(title):
title = ""
break
for j in range(3):
if _begin(self.boxes[i + j]["text"]):
break
authors.append(self.boxes[i + j]["text"])
break
break
# get abstract
abstr = ""
i = 0
while i + 1 < min(32, len(self.boxes)):
b = self.boxes[i]
i += 1
txt = b["text"].lower().strip()
if re.match("(abstract|摘要)", txt):
if len(txt.split()) > 32 or len(txt) > 64:
abstr = txt + self._line_tag(b, zoomin)
break
txt = self.boxes[i]["text"].lower().strip()
if len(txt.split()) > 32 or len(txt) > 64:
abstr = txt + self._line_tag(self.boxes[i], zoomin)
i += 1
break
if not abstr:
i = 0
callback(
0.8, "Page {}~{}: Text merging finished".format(
from_page, min(
to_page, self.total_page)))
for b in self.boxes:
logging.debug("{} {}".format(b["text"], b.get("layoutno")))
logging.debug("{}".format(tbls))
return {
"title": title,
"authors": " ".join(authors),
"abstract": abstr,
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
re.match(r"(text|title)", b.get("layoutno", "text"))],
"tables": tbls
}
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
The abstract of the paper is kept as one entire chunk and is never split.
"""
if re.search(r"\.pdf$", filename, re.IGNORECASE):
if kwargs.get("parser_config", {}).get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
paper = {
"title": filename,
"authors": " ",
"abstract": "",
"sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
"tables": []
}
else:
pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
else:
raise NotImplementedError("file type not supported yet(pdf supported)")
doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]),
"title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
logging.debug("It's English.....{}".format(eng))
res = tokenize_table(paper["tables"], doc, eng)
if paper["abstract"]:
d = copy.deepcopy(doc)
txt = pdf_parser.remove_tag(paper["abstract"])
d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
d["important_tks"] = " ".join(d["important_kwd"])
d["image"], poss = pdf_parser.crop(
paper["abstract"], need_position=True)
add_positions(d, poss)
tokenize(d, txt, eng)
res.append(d)
sorted_sections = paper["sections"]
# set pivot using the most frequent type of title,
# then merge between 2 pivot
bull = bullets_category([txt for txt, _ in sorted_sections])
most_level, levels = title_frequency(bull, sorted_sections)
assert len(sorted_sections) == len(levels)
sec_ids = []
sid = 0
for i, lvl in enumerate(levels):
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
sid += 1
sec_ids.append(sid)
logging.debug("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))
chunks = []
last_sid = -2
for (txt, _), sec_id in zip(sorted_sections, sec_ids):
if sec_id == last_sid:
if chunks:
chunks[-1] += "\n" + txt
continue
chunks.append(txt)
last_sid = sec_id
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
"""
readed = [0] * len(paper["lines"])
# find colon firstly
i = 0
while i + 1 < len(paper["lines"]):
txt = pdf_parser.remove_tag(paper["lines"][i][0])
j = i
if txt.strip("\n").strip()[-1] not in ":":
i += 1
continue
i += 1
while i < len(paper["lines"]) and not paper["lines"][i][0]:
i += 1
if i >= len(paper["lines"]): break
proj = [paper["lines"][i][0].strip()]
i += 1
while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
proj.append(paper["lines"][i])
i += 1
for k in range(j, i): readed[k] = True
txt = txt[::-1]
if eng:
r = re.search(r"(.*?) ([\\.;?!]|$)", txt)
txt = r.group(1)[::-1] if r else txt[::-1]
else:
r = re.search(r"(.*?) ([。?;!]|$)", txt)
txt = r.group(1)[::-1] if r else txt[::-1]
for p in proj:
d = copy.deepcopy(doc)
txt += "\n" + pdf_parser.remove_tag(p)
d["image"], poss = pdf_parser.crop(p, need_position=True)
add_positions(d, poss)
tokenize(d, txt, eng)
res.append(d)
i = 0
chunk = []
tk_cnt = 0
def add_chunk():
nonlocal chunk, res, doc, pdf_parser, tk_cnt
d = copy.deepcopy(doc)
ck = "\n".join(chunk)
tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
d["image"], poss = pdf_parser.crop(ck, need_position=True)
add_positions(d, poss)
res.append(d)
chunk = []
tk_cnt = 0
while i < len(paper["lines"]):
if tk_cnt > 128:
add_chunk()
if readed[i]:
i += 1
continue
readed[i] = True
txt, layouts = paper["lines"][i]
txt_ = pdf_parser.remove_tag(txt)
i += 1
cnt = num_tokens_from_string(txt_)
if any([
layouts.find("title") >= 0 and chunk,
cnt + tk_cnt > 128 and tk_cnt > 32,
]):
add_chunk()
chunk = [txt]
tk_cnt = cnt
else:
chunk.append(txt)
tk_cnt += cnt
if chunk: add_chunk()
for i, d in enumerate(res):
print(d)
# d["image"].save(f"./logs/{i}.jpg")
return res
"""
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)
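The final merge above groups consecutive sections that share a section id. A minimal sketch of that grouping is shown below with invented data; the real code additionally carries position tags and token counts, which this sketch omits.
def merge_by_section_id(sections):
    # sections: list of (text, sec_id); consecutive texts with the same id become one chunk.
    chunks, last_sid = [], None
    for text, sec_id in sections:
        if chunks and sec_id == last_sid:
            chunks[-1] += "\n" + text
        else:
            chunks.append(text)
            last_sid = sec_id
    return chunks

if __name__ == "__main__":
    print(merge_by_section_id([("Abstract", 0), ("1 Introduction", 1), ("intro body", 1), ("2 Method", 2)]))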

View File

@ -1,59 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import io
import numpy as np
from PIL import Image
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from rag.nlp import tokenize
from deepdoc.vision import OCR
ocr = OCR()
def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
img = Image.open(io.BytesIO(binary)).convert('RGB')
doc = {
"docnm_kwd": filename,
"image": img
}
bxs = ocr(np.array(img))
txt = "\n".join([t[0] for _, t in bxs if t[0]])
eng = lang.lower() == "english"
callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
if (eng and len(txt.split()) > 32) or len(txt) > 32:
tokenize(doc, txt, eng)
callback(0.8, "OCR results is too long to use CV LLM.")
return [doc]
try:
callback(0.4, "Use CV LLM to describe the picture.")
cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
img_binary = io.BytesIO()
img.save(img_binary, format='JPEG')
img_binary.seek(0)
ans = cv_mdl.describe(img_binary.read())
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans
tokenize(doc, txt, eng)
return [doc]
except Exception as e:
callback(prog=-1, msg=str(e))
return []
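A hedged restatement of the gating rule above, extracted into a tiny helper so the threshold is easy to see; the 32-token/32-character cut-off mirrors the code, while the helper name and sample inputs are invented for illustration.
def needs_vision_model(ocr_text, eng):
    # True when OCR produced too little text and a vision-LLM description is worth requesting.
    long_enough = (eng and len(ocr_text.split()) > 32) or len(ocr_text) > 32
    return not long_enough

if __name__ == "__main__":
    print(needs_vision_model("short caption", eng=True))   # True: fall back to the vision model
    print(needs_vision_model("x" * 100, eng=False))        # False: OCR text is enough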

View File

@ -1,147 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
from io import BytesIO
from PIL import Image
from rag.nlp import tokenize, is_english
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, PptParser, PlainParser
from PyPDF2 import PdfReader as pdf2_read
class Ppt(PptParser):
def __call__(self, fnm, from_page, to_page, callback=None):
txts = super().__call__(fnm, from_page, to_page)
callback(0.5, "Text extraction finished.")
import aspose.slides as slides
import aspose.pydrawing as drawing
imgs = []
with slides.Presentation(BytesIO(fnm)) as presentation:
for i, slide in enumerate(presentation.slides[from_page: to_page]):
buffered = BytesIO()
slide.get_thumbnail(
0.5, 0.5).save(
buffered, drawing.imaging.ImageFormat.jpeg)
imgs.append(Image.open(buffered))
assert len(imgs) == len(
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
callback(0.9, "Image extraction finished")
self.is_english = is_english(txts)
return [(txts[i], imgs[i]) for i in range(len(txts))]
class Pdf(PdfParser):
def __init__(self):
super().__init__()
def __garbage(self, txt):
txt = txt.lower().strip()
if re.match(r"[0-9\.,%/-]+$", txt):
return True
if len(txt) < 3:
return True
return False
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(filename if not binary else binary,
zoomin, from_page, to_page, callback)
callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, min(to_page, self.total_page), timer() - start))
assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
len(self.boxes), len(self.page_images))
res = []
for i in range(len(self.boxes)):
lines = "\n".join([b["text"] for b in self.boxes[i]
if not self.__garbage(b["text"])])
res.append((lines, self.page_images[i]))
callback(0.9, "Page {}~{}: Parsing finished".format(
from_page, min(to_page, self.total_page)))
return res
class PlainPdf(PlainParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, callback=None, **kwargs):
self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
page_txt = []
for page in self.pdf.pages[from_page: to_page]:
page_txt.append(page.extract_text())
callback(0.9, "Parsing finished")
return [(txt, None) for txt in page_txt]
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are pdf and pptx.
Every page is treated as a chunk, and the thumbnail of every page is stored.
PPT files are parsed with this method automatically; no per-file setup is necessary.
"""
eng = lang.lower() == "english"
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
ppt_parser = Ppt()
for pn, (txt, img) in enumerate(ppt_parser(
filename if not binary else binary, from_page, 1000000, callback)):
d = copy.deepcopy(doc)
pn += from_page
d["image"] = img
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
tokenize(d, txt, eng)
res.append(d)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
for pn, (txt, img) in enumerate(pdf_parser(filename, binary,
from_page=from_page, to_page=to_page, callback=callback)):
d = copy.deepcopy(doc)
pn += from_page
if img:
d["image"] = img
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
tokenize(d, txt, eng)
res.append(d)
return res
raise NotImplementedError(
"file type not supported yet(pptx, pdf supported)")
if __name__ == "__main__":
import sys
def dummy(a, b):
pass
chunk(sys.argv[1], callback=dummy)
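For orientation, a sketch of the per-page record this chunker builds (1-based page number, top offset and thumbnail bounding box). The field names follow the code above; the "text" field stands in for the content that tokenize() would normally add, and the sizes are invented.
def page_record(pn, text, width, height):
    # One chunk per page: 1-based page number, a zero top offset and the thumbnail bounding box.
    return {
        "page_num_int": [pn + 1],
        "top_int": [0],
        "position_int": [(pn + 1, 0, width, 0, height)],
        "text": text,  # stand-in for the tokenized fields produced by tokenize()
    }

if __name__ == "__main__":
    print(page_record(0, "slide one text", 960, 540))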

View File

@ -21,17 +21,30 @@ from copy import deepcopy
from io import BytesIO
from timeit import default_timer as timer
from openpyxl import load_workbook
from deepdoc.parser.utils import get_text
from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
from rag.nlp import rag_tokenizer, tokenize_table, concat_img
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from docx import Document
from PIL import Image
from markdown import markdown
from rag.nlp import find_codec
class Excel(ExcelParser):
def get_text(fnm: str, binary=None) -> str:
txt = ""
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r") as f:
while True:
line = f.readline()
if not line:
break
txt += line
return txt
class Excel:
def __call__(self, fnm, binary=None, callback=None):
if not binary:
wb = load_workbook(fnm)
@ -61,31 +74,18 @@ class Excel(ExcelParser):
else:
fails.append(str(i + 1))
if len(res) % 999 == 0:
callback(len(res) *
0.6 /
total, ("Extract pairs: {}".format(len(res)) +
(f"{len(fails)} failure, line: %s..." %
(",".join(fails[:3])) if fails else "")))
callback(len(res) * 0.6 / total, ("Extract pairs: {}".format(len(res)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(0.6, ("Extract pairs: {}. ".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
self.is_english = is_english(
[rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
callback(0.6, ("Extract pairs: {}. ".format(len(res)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
self.is_english = is_english([rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
return res
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
class Pdf:
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.debug("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
start = timer()
@ -100,9 +100,9 @@ class Pdf(PdfParser):
self._text_merge()
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
tbls = self._extract_table_figure(True, zoomin, True, True)
#self._naive_vertical_merge()
# self._naive_vertical_merge()
# self._concat_downward()
#self._filter_forpages()
# self._filter_forpages()
logging.debug("layouts: {}".format(timer() - start))
sections = [b["text"] for b in self.boxes]
bull_x0_list = []
@ -110,57 +110,61 @@ class Pdf(PdfParser):
if q_bull == -1:
raise ValueError("Unable to recognize Q&A structure.")
qai_list = []
last_q, last_a, last_tag = '', '', ''
last_q, last_a, last_tag = "", "", ""
last_index = -1
last_box = {'text':''}
last_box = {"text": ""}
last_bull = None
def sort_key(element):
tbls_pn = element[1][0][0]
tbls_top = element[1][0][3]
return tbls_pn, tbls_top
tbls.sort(key=sort_key)
tbl_index = 0
last_pn, last_bottom = 0, 0
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = 1, 0, 0, 0, 0, "@@0\t0\t0\t0\t0##", ""
for box in self.boxes:
section, line_tag = box['text'], self._line_tag(box, zoomin)
section, line_tag = box["text"], self._line_tag(box, zoomin)
has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
last_box, last_index, last_bull = box, index, has_bull
line_pn = float(line_tag.lstrip('@@').split('\t')[0])
line_top = float(line_tag.rstrip('##').split('\t')[3])
line_pn = float(line_tag.lstrip("@@").split("\t")[0])
line_top = float(line_tag.rstrip("##").split("\t")[3])
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
if not has_bull: # No question bullet
if not last_q:
if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top): # image passed
if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top): # image passed
tbl_index += 1
continue
else:
sum_tag = line_tag
sum_section = section
while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the middle of current answer
sum_tag = f'{tbl_tag}{sum_tag}'
sum_section = f'{tbl_text}{sum_section}'
while ((tbl_pn == last_pn and tbl_top >= last_bottom) or (tbl_pn > last_pn)) and (
(tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)
): # add image at the middle of current answer
sum_tag = f"{tbl_tag}{sum_tag}"
sum_section = f"{tbl_text}{sum_section}"
tbl_index += 1
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
last_a = f'{last_a}{sum_section}'
last_tag = f'{last_tag}{sum_tag}'
last_a = f"{last_a}{sum_section}"
last_tag = f"{last_tag}{sum_tag}"
else:
if last_q:
while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the end of last answer
last_tag = f'{last_tag}{tbl_tag}'
last_a = f'{last_a}{tbl_text}'
while ((tbl_pn == last_pn and tbl_top >= last_bottom) or (tbl_pn > last_pn)) and (
(tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)
): # add image at the end of last answer
last_tag = f"{last_tag}{tbl_tag}"
last_a = f"{last_a}{tbl_text}"
tbl_index += 1
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
image, poss = self.crop(last_tag, need_position=True)
qai_list.append((last_q, last_a, image, poss))
last_q, last_a, last_tag = '', '', ''
last_q, last_a, last_tag = "", "", ""
last_q = has_bull.group()
_, end = has_bull.span()
last_a = section[end:]
last_tag = line_tag
last_bottom = float(line_tag.rstrip('##').split('\t')[4])
last_bottom = float(line_tag.rstrip("##").split("\t")[4])
last_pn = line_pn
if last_q:
qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
@ -168,36 +172,34 @@ class Pdf(PdfParser):
def get_tbls_info(self, tbls, tbl_index):
if tbl_index >= len(tbls):
return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
tbl_pn = tbls[tbl_index][1][0][0]+1
return 1, 0, 0, 0, 0, "@@0\t0\t0\t0\t0##", ""
tbl_pn = tbls[tbl_index][1][0][0] + 1
tbl_left = tbls[tbl_index][1][0][1]
tbl_right = tbls[tbl_index][1][0][2]
tbl_top = tbls[tbl_index][1][0][3]
tbl_bottom = tbls[tbl_index][1][0][4]
tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
_tbl_text = ''.join(tbls[tbl_index][0][1])
tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
_tbl_text = "".join(tbls[tbl_index][0][1])
return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, _tbl_text
class Docx(DocxParser):
class Docx:
def __init__(self):
pass
def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic')
img = paragraph._element.xpath(".//pic:pic")
if not img:
return None
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
embed = img.xpath(".//a:blip/@r:embed")[0]
related_part = document.part.related_parts[embed]
image = related_part.image
image = Image.open(BytesIO(image.blob)).convert('RGB')
image = Image.open(BytesIO(image.blob)).convert("RGB")
return image
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
last_answer, last_image = "", None
question_stack, level_stack = [], []
@ -205,19 +207,19 @@ class Docx(DocxParser):
for p in self.doc.paragraphs:
if pn > to_page:
break
question_level, p_text = 0, ''
question_level, p_text = 0, ""
if from_page <= pn < to_page and p.text.strip():
question_level, p_text = docx_question_level(p)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{p_text}'
if not question_level or question_level > 6: # not a question
last_answer = f"{last_answer}\n{p_text}"
current_image = self.get_picture(self.doc, p)
last_image = concat_img(last_image, current_image)
else: # is a question
else: # is a question
if last_answer or last_image:
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
qai_list.append((sum_question, last_answer, last_image))
last_answer, last_image = '', None
last_answer, last_image = "", None
i = question_level
while question_stack and i <= level_stack[-1]:
@ -226,26 +228,26 @@ class Docx(DocxParser):
question_stack.append(p_text)
level_stack.append(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
if "lastRenderedPageBreak" in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
if last_answer:
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
qai_list.append((sum_question, last_answer, last_image))
tbls = []
for tb in self.doc.tables:
html= "<table>"
html = "<table>"
for r in tb.rows:
html += "<tr>"
i = 0
while i < len(r.cells):
span = 1
c = r.cells[i]
for j in range(i+1, len(r.cells)):
for j in range(i + 1, len(r.cells)):
if c.text == r.cells[j].text:
span += 1
i = j
@ -258,15 +260,13 @@ class Docx(DocxParser):
def rmPrefix(txt):
return re.sub(
r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t: ]+", "", txt.strip(), flags=re.IGNORECASE)
return re.sub(r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t: ]+", "", txt.strip(), flags=re.IGNORECASE)
def beAdocPdf(d, q, a, eng, image, poss):
qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join(
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_with_weight"] = "\t".join([qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_ltks"] = rag_tokenizer.tokenize(q)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["image"] = image
@ -277,8 +277,7 @@ def beAdocPdf(d, q, a, eng, image, poss):
def beAdocDocx(d, q, a, eng, image, row_num=-1):
qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join(
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_with_weight"] = "\t".join([qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_ltks"] = rag_tokenizer.tokenize(q)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["image"] = image
@ -290,8 +289,7 @@ def beAdocDocx(d, q, a, eng, image, row_num=-1):
def beAdoc(d, q, a, eng, row_num=-1):
qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join(
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_with_weight"] = "\t".join([qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_ltks"] = rag_tokenizer.tokenize(q)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
if row_num >= 0:
@ -300,28 +298,25 @@ def beAdoc(d, q, a, eng, row_num=-1):
def mdQuestionLevel(s):
match = re.match(r'#*', s)
return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
match = re.match(r"#*", s)
return (len(match.group(0)), s.lstrip("#").lstrip()) if match else (0, s)
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
If the file is in Excel format, it should contain two columns, question and answer, with no header.
The question column must come before the answer column.
Multiple sheets are fine as long as the columns are composed correctly.
Excel and csv(txt) format files are supported.
If the file is in Excel format, it should contain two columns, question and answer, with no header.
The question column must come before the answer column.
Multiple sheets are fine as long as the columns are composed correctly.
If it's in csv format, it should be UTF-8 encoded, with a TAB as the delimiter between question and answer.
If it's in csv format, it should be UTF-8 encoded, with a TAB as the delimiter between question and answer.
All malformed lines will be ignored.
Every pair of Q&A will be treated as a chunk.
All malformed lines will be ignored.
Every pair of Q&A will be treated as a chunk.
"""
eng = lang.lower() == "english"
res = []
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()
@ -350,21 +345,19 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
if question:
answer += "\n" + lines[i]
else:
fails.append(str(i+1))
fails.append(str(i + 1))
elif len(arr) == 2:
if question and answer:
res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
question, answer = arr
i += 1
if len(res) % 999 == 0:
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
if question:
res.append(beAdoc(deepcopy(doc), question, answer, eng, len(lines)))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
@ -390,21 +383,18 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
question, answer = row
if len(res) % 999 == 0:
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
if question:
res.append(beAdoc(deepcopy(doc), question, answer, eng, len(list(reader))))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
pdf_parser = Pdf()
qai_list, tbls = pdf_parser(filename if not binary else binary,
from_page=0, to_page=10000, callback=callback)
qai_list, tbls = pdf_parser(filename if not binary else binary, from_page=0, to_page=10000, callback=callback)
for q, a, image, poss in qai_list:
res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
return res
@ -417,20 +407,20 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
question_stack, level_stack = [], []
code_block = False
for index, line in enumerate(lines):
if line.strip().startswith('```'):
if line.strip().startswith("```"):
code_block = not code_block
question_level, question = 0, ''
question_level, question = 0, ""
if not code_block:
question_level, question = mdQuestionLevel(line)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{line}'
else: # is a question
if not question_level or question_level > 6: # not a question
last_answer = f"{last_answer}\n{line}"
else: # is a question
if last_answer.strip():
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
last_answer = ''
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=["markdown.extensions.tables"]), eng, index))
last_answer = ""
i = question_level
while question_stack and i <= level_stack[-1]:
@ -439,22 +429,20 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
question_stack.append(question)
level_stack.append(question_level)
if last_answer.strip():
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=["markdown.extensions.tables"]), eng, index))
return res
elif re.search(r"\.docx$", filename, re.IGNORECASE):
docx_parser = Docx()
qai_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
qai_list, tbls = docx_parser(filename, binary, from_page=0, to_page=10000, callback=callback)
res = tokenize_table(tbls, doc, eng)
for i, (q, a, image) in enumerate(qai_list):
res.append(beAdocDocx(deepcopy(doc), q, a, eng, image, i))
return res
raise NotImplementedError(
"Excel, csv(txt), pdf, markdown and docx format files are supported.")
raise NotImplementedError("Excel, csv(txt), pdf, markdown and docx format files are supported.")
if __name__ == "__main__":
@ -462,4 +450,5 @@ if __name__ == "__main__":
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

View File

@ -16,56 +16,29 @@
import logging
import base64
import datetime
import json
import re
import pandas as pd
import requests
from api.db.services.knowledgebase_service import KnowledgebaseService
from rag.nlp import rag_tokenizer
from deepdoc.parser.resume import refactor
from deepdoc.parser.resume import step_one, step_two
from rag.utils import rmSpace
forbidden_select_fields4resume = [
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
]
forbidden_select_fields4resume = ["name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"]
def remote_call(filename, binary):
q = {
"header": {
"uid": 1,
"user": "kevinhu",
"log_id": filename
},
"header": {"uid": 1, "user": "kevinhu", "log_id": filename},
"request": {
"p": {
"request_id": "1",
"encrypt_type": "base64",
"filename": filename,
"langtype": '',
"fileori": base64.b64encode(binary).decode('utf-8')
},
"p": {"request_id": "1", "encrypt_type": "base64", "filename": filename, "langtype": "", "fileori": base64.b64encode(binary).decode("utf-8")},
"c": "resume_parse_module",
"m": "resume_parse"
}
"m": "resume_parse",
},
}
for _ in range(3):
try:
resume = requests.post(
"http://127.0.0.1:61670/tog",
data=json.dumps(q))
resume = requests.post("http://127.0.0.1:61670/tog", data=json.dumps(q))
resume = resume.json()["response"]["results"]
resume = refactor(resume)
for k in ["education", "work", "project",
"training", "skill", "certificate", "language"]:
if not resume.get(k) and k in resume:
del resume[k]
resume = step_one.refactor(pd.DataFrame([{"resume_content": json.dumps(resume), "tob_resume_id": "x",
"updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
resume = step_two.parse(resume)
return resume
except Exception:
logging.exception("Resume parser has not been supported yet!")
@ -103,23 +76,19 @@ def chunk(filename, binary=None, callback=None, **kwargs):
"expect_city_names_tks": "期望城市",
"work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
"corporation_name_tks": "最近就职(上班)的公司/上一家公司",
"first_school_name_tks": "第一学历毕业学校",
"first_degree_kwd": "第一学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"highest_degree_kwd": "最高学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"first_major_tks": "第一学历专业",
"edu_first_fea_kwd": "第一学历标签211留学双一流985海外知名重点大学中专专升本专科本科大专",
"degree_kwd": "过往学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"major_tks": "学过的专业/过往专业",
"school_name_tks": "学校/毕业院校",
"sch_rank_kwd": "学校标签(顶尖学校,精英学校,优质学校,一般学校)",
"edu_fea_kwd": "教育标签211留学双一流985海外知名重点大学中专专升本专科本科大专",
"corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
"edu_end_int": "毕业年份",
"industry_name_tks": "所在行业",
"birth_dt": "生日/出生年份",
"expect_position_name_tks": "期望职位/期望职能/期望岗位",
}
@ -132,10 +101,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
if n.find("tks") > 0:
v = rmSpace(v)
titles.append(str(v))
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")
}
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pairs = []
for n, m in field_map.items():
@ -148,23 +114,20 @@ def chunk(filename, binary=None, callback=None, **kwargs):
v = rmSpace(v)
pairs.append((m, str(v)))
doc["content_with_weight"] = "\n".join(
["{}: {}".format(re.sub(r"[^]+", "", k), v) for k, v in pairs])
doc["content_with_weight"] = "\n".join(["{}: {}".format(re.sub(r"[^]+", "", k), v) for k, v in pairs])
doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"])
doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"])
for n, _ in field_map.items():
if n not in resume:
continue
if isinstance(resume[n], list) and (
len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
if isinstance(resume[n], list) and (len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
resume[n] = resume[n][0]
if n.find("_tks") > 0:
resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
doc[n] = resume[n]
logging.debug("chunked resume to " + str(doc))
KnowledgebaseService.update_parser_config(
kwargs["kb_id"], {"field_map": field_map})
KnowledgebaseService.update_parser_config(kwargs["kb_id"], {"field_map": field_map})
return [doc]
@ -173,4 +136,5 @@ if __name__ == "__main__":
def dummy(a, b):
pass
chunk(sys.argv[1], callback=dummy)

View File

@ -1,250 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
from io import BytesIO
from xpinyin import Pinyin
import numpy as np
import pandas as pd
# from openpyxl import load_workbook, Workbook
from dateutil.parser import parse as datetime_parse
from api.db.services.knowledgebase_service import KnowledgebaseService
from deepdoc.parser.utils import get_text
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import ExcelParser
class Excel(ExcelParser):
def __call__(self, fnm, binary=None, from_page=0,
to_page=10000000000, callback=None):
if not binary:
wb = Excel._load_excel_to_workbook(fnm)
else:
wb = Excel._load_excel_to_workbook(BytesIO(binary))
total = 0
for sheetname in wb.sheetnames:
total += len(list(wb[sheetname].rows))
res, fails, done = [], [], 0
rn = 0
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:
continue
headers = [cell.value for cell in rows[0]]
missed = set([i for i, h in enumerate(headers) if h is None])
headers = [
cell.value for i,
cell in enumerate(
rows[0]) if i not in missed]
if not headers:
continue
data = []
for i, r in enumerate(rows[1:]):
rn += 1
if rn - 1 < from_page:
continue
if rn - 1 >= to_page:
break
row = [
cell.value for ii,
cell in enumerate(r) if ii not in missed]
if len(row) != len(headers):
fails.append(str(i))
continue
data.append(row)
done += 1
if np.array(data).size == 0:
continue
res.append(pd.DataFrame(np.array(data), columns=headers))
callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
def trans_datatime(s):
try:
return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S")
except Exception:
pass
def trans_bool(s):
if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$",
str(s).strip(), flags=re.IGNORECASE):
return "yes"
if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
return "no"
def column_data_type(arr):
arr = list(arr)
counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
trans = {t: f for f, t in
[(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
for a in arr:
if a is None:
continue
if re.match(r"[+-]?[0-9]{,19}(\.0+)?$", str(a).replace("%%", "")):
counts["int"] += 1
elif re.match(r"[+-]?[0-9.]{,19}$", str(a).replace("%%", "")):
counts["float"] += 1
elif re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√|false|no|否|⍻|×)$", str(a), flags=re.IGNORECASE):
counts["bool"] += 1
elif trans_datatime(str(a)):
counts["datetime"] += 1
else:
counts["text"] += 1
counts = sorted(counts.items(), key=lambda x: x[1] * -1)
ty = counts[0][0]
for i in range(len(arr)):
if arr[i] is None:
continue
try:
arr[i] = trans[ty](str(arr[i]))
except Exception:
arr[i] = None
# if ty == "text":
# if len(arr) > 128 and uni / len(arr) < 0.1:
# ty = "keyword"
return arr, ty
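# Illustrative sketch (not part of this commit): the inference column_data_type above
# is meant to perform on a small made-up column. Mostly-integer values vote the type
# to "int"; every value is then coerced with int(), and failures become None.
sample_column = ["1", "2", "3", "oops", None]
# expected: arr == [1, 2, 3, None, None], ty == "int"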
def chunk(filename, binary=None, from_page=0, to_page=10000000000,
lang="Chinese", callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
For csv or txt files, the delimiter between columns is TAB.
The first line must be the column headers.
Column headers must be meaningful terms in order for our NLP model to understand them.
It helps to enumerate synonyms, separated by a slash '/', and even better to
enumerate possible values in brackets, like 'gender/sex(male, female)'.
Here are some examples of headers:
1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
2. 姓名/名字\t电话/手机/微信\t最高学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）
Every row in the table will be treated as a chunk.
"""
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()
dfs = excel_parser(
filename,
binary,
from_page=from_page,
to_page=to_page,
callback=callback)
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
lines = txt.split("\n")
fails = []
headers = lines[0].split(kwargs.get("delimiter", "\t"))
rows = []
for i, line in enumerate(lines[1:]):
if i < from_page:
continue
if i >= to_page:
break
row = [field for field in line.split(kwargs.get("delimiter", "\t"))]
if len(row) != len(headers):
fails.append(str(i))
continue
rows.append(row)
callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
dfs = [pd.DataFrame(np.array(rows), columns=headers)]
else:
raise NotImplementedError(
"file type not supported yet(excel, text, csv supported)")
res = []
PY = Pinyin()
fieds_map = {
"text": "_tks",
"int": "_long",
"keyword": "_kwd",
"float": "_flt",
"datetime": "_dt",
"bool": "_kwd"}
for df in dfs:
for n in ["id", "_id", "index", "idx"]:
if n in df.columns:
del df[n]
clmns = df.columns.values
txts = list(copy.deepcopy(clmns))
py_clmns = [
PY.get_pinyins(
re.sub(
r"(/.*|[^]+?|\([^()]+?\))",
"",
str(n)),
'_')[0] for n in clmns]
clmn_tys = []
for j in range(len(clmns)):
cln, ty = column_data_type(df[clmns[j]])
clmn_tys.append(ty)
df[clmns[j]] = cln
if ty == "text":
txts.extend([str(c) for c in cln if c])
clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " "))
for i in range(len(clmns))]
eng = lang.lower() == "english" # is_english(txts)
for ii, row in df.iterrows():
d = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
row_txt = []
for j in range(len(clmns)):
if row[clmns[j]] is None:
continue
if not str(row[clmns[j]]):
continue
if not isinstance(row[clmns[j]], pd.Series) and pd.isna(row[clmns[j]]):
continue
fld = clmns_map[j][0]
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
row[clmns[j]])
row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
if not row_txt:
continue
tokenize(d, "; ".join(row_txt), eng)
res.append(d)
KnowledgebaseService.update_parser_config(
kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
callback(0.35, "")
return res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)

View File

@ -17,10 +17,24 @@ import json
import re
import csv
from copy import deepcopy
from deepdoc.parser.utils import get_text
from rag.app.qa import Excel
from rag.nlp import rag_tokenizer
from rag.nlp import find_codec
def get_text(fnm: str, binary=None) -> str:
txt = ""
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r") as f:
while True:
line = f.readline()
if not line:
break
txt += line
return txt
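# Illustrative sketch (not part of this commit): the two ways the get_text helper
# above is meant to be called. The file name and sample bytes are made up.
sample_bytes = "question\tanswer\n".encode("utf-8")
text_from_binary = get_text("upload.txt", binary=sample_bytes)  # decoded via find_codec
# text_from_disk = get_text("/path/to/local.txt")               # read line by line from disk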
def beAdoc(d, q, a, eng, row_num=-1):