Merge pull request #147 from zstar1003/dev

refactor(agent): remove deepdoc-related code to keep it from misleading deepwiki
zstar 2025-06-05 10:14:36 +08:00 committed by GitHub
commit 1fbb0d58c0
60 changed files with 285 additions and 66406 deletions

View File

@ -7,7 +7,6 @@ WORKDIR /ragflow
# Copy the Python-related code directories
COPY api ./api
COPY conf ./conf
COPY deepdoc ./deepdoc
COPY rag ./rag
COPY agent ./agent
COPY graphrag ./graphrag

View File

@ -17,7 +17,6 @@ import json
import re
from abc import ABC
import requests
from deepdoc.parser import HtmlParser
from agent.component.base import ComponentBase, ComponentParamBase
@ -38,11 +37,11 @@ class InvokeParam(ComponentParamBase):
self.datatype = "json" # New parameter to determine data posting type
def check(self):
self.check_valid_value(self.method.lower(), "Type of content from the crawler", ['get', 'post', 'put'])
self.check_valid_value(self.method.lower(), "Type of content from the crawler", ["get", "post", "put"])
self.check_empty(self.url, "End point URL")
self.check_positive_integer(self.timeout, "Timeout time in second")
self.check_boolean(self.clean_html, "Clean HTML")
self.check_valid_value(self.datatype.lower(), "Data post type", ['json', 'formdata']) # Check for valid datapost value
self.check_valid_value(self.datatype.lower(), "Data post type", ["json", "formdata"]) # Check for valid datapost value
class Invoke(ComponentBase, ABC):
@ -52,9 +51,9 @@ class Invoke(ComponentBase, ABC):
args = {}
for para in self._param.variables:
if para.get("component_id"):
if '@' in para["component_id"]:
component = para["component_id"].split('@')[0]
field = para["component_id"].split('@')[1]
if "@" in para["component_id"]:
component = para["component_id"].split("@")[0]
field = para["component_id"].split("@")[1]
cpn = self._canvas.get_component(component)["obj"]
for param in cpn._param.query:
if param["key"] == field:
@ -83,50 +82,27 @@ class Invoke(ComponentBase, ABC):
if re.sub(r"https?:?/?/?", "", self._param.proxy):
proxies = {"http": self._param.proxy, "https": self._param.proxy}
if method == 'get':
response = requests.get(url=url,
params=args,
headers=headers,
proxies=proxies,
timeout=self._param.timeout)
if method == "get":
response = requests.get(url=url, params=args, headers=headers, proxies=proxies, timeout=self._param.timeout)
if self._param.clean_html:
sections = HtmlParser()(None, response.content)
return Invoke.be_output("\n".join(sections))
return Invoke.be_output("\n")
return Invoke.be_output(response.text)
if method == 'put':
if self._param.datatype.lower() == 'json':
response = requests.put(url=url,
json=args,
headers=headers,
proxies=proxies,
timeout=self._param.timeout)
if method == "put":
if self._param.datatype.lower() == "json":
response = requests.put(url=url, json=args, headers=headers, proxies=proxies, timeout=self._param.timeout)
else:
response = requests.put(url=url,
data=args,
headers=headers,
proxies=proxies,
timeout=self._param.timeout)
response = requests.put(url=url, data=args, headers=headers, proxies=proxies, timeout=self._param.timeout)
if self._param.clean_html:
sections = HtmlParser()(None, response.content)
return Invoke.be_output("\n".join(sections))
return Invoke.be_output("\n")
return Invoke.be_output(response.text)
if method == 'post':
if self._param.datatype.lower() == 'json':
response = requests.post(url=url,
json=args,
headers=headers,
proxies=proxies,
timeout=self._param.timeout)
if method == "post":
if self._param.datatype.lower() == "json":
response = requests.post(url=url, json=args, headers=headers, proxies=proxies, timeout=self._param.timeout)
else:
response = requests.post(url=url,
data=args,
headers=headers,
proxies=proxies,
timeout=self._param.timeout)
response = requests.post(url=url, data=args, headers=headers, proxies=proxies, timeout=self._param.timeout)
if self._param.clean_html:
sections = HtmlParser()(None, response.content)
return Invoke.be_output("\n".join(sections))
return Invoke.be_output("\n")
return Invoke.be_output(response.text)
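With `HtmlParser` gone from this component, the `clean_html` branches above now return only a bare newline. Purely as an illustration (not part of this commit), HTML could still be reduced to plain text with the standard library; the helper below is a hypothetical sketch.

```python
# Hypothetical fallback, not part of this commit: reduce HTML to plain text with
# the standard library so a clean_html option could still return readable output.
from html.parser import HTMLParser


class _TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.parts = []

    def handle_data(self, data):
        text = data.strip()
        if text:
            self.parts.append(text)


def strip_html(raw: str) -> str:
    extractor = _TextExtractor()
    extractor.feed(raw)
    return "\n".join(extractor.parts)


if __name__ == "__main__":
    print(strip_html("<html><body><h1>Title</h1><p>Hello, world.</p></body></html>"))
```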

View File

@ -22,7 +22,6 @@ import flask
from flask import request
from flask_login import login_required, current_user
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search
from api.db import FileType, TaskStatus, ParserType, FileSource
@ -49,50 +48,44 @@ from api.utils.web_utils import html2pdf, is_valid_url
from api.constants import IMG_BASE64_PREFIX
@manager.route('/upload', methods=['POST']) # noqa: F821
@manager.route("/upload", methods=["POST"]) # noqa: F821
@login_required
@validate_request("kb_id")
def upload():
kb_id = request.form.get("kb_id")
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
if 'file' not in request.files:
return get_json_result(
data=False, message='No file part!', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
if "file" not in request.files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)
file_objs = request.files.getlist('file')
file_objs = request.files.getlist("file")
for file_obj in file_objs:
if file_obj.filename == '':
return get_json_result(
data=False, message='No file selected!', code=settings.RetCode.ARGUMENT_ERROR)
if file_obj.filename == "":
return get_json_result(data=False, message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR)
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
raise LookupError("Can't find this knowledgebase!")
err, files = FileService.upload_document(kb, file_objs, current_user.id)
files = [f[0] for f in files] # remove the blob
files = [f[0] for f in files] # remove the blob
if err:
return get_json_result(
data=files, message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
return get_json_result(data=files, message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
return get_json_result(data=files)
@manager.route('/web_crawl', methods=['POST']) # noqa: F821
@manager.route("/web_crawl", methods=["POST"]) # noqa: F821
@login_required
@validate_request("kb_id", "name", "url")
def web_crawl():
kb_id = request.form.get("kb_id")
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
name = request.form.get("name")
url = request.form.get("url")
if not is_valid_url(url):
return get_json_result(
data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message="The URL format is invalid", code=settings.RetCode.ARGUMENT_ERROR)
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
raise LookupError("Can't find this knowledgebase!")
@ -108,10 +101,7 @@ def web_crawl():
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
try:
filename = duplicate_name(
DocumentService.query,
name=name + ".pdf",
kb_id=kb.id)
filename = duplicate_name(DocumentService.query, name=name + ".pdf", kb_id=kb.id)
filetype = filename_type(filename)
if filetype == FileType.OTHER.value:
raise RuntimeError("This type of file has not been supported yet!")
@ -130,7 +120,7 @@ def web_crawl():
"name": filename,
"location": location,
"size": len(blob),
"thumbnail": thumbnail(filename, blob)
"thumbnail": thumbnail(filename, blob),
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
@ -147,58 +137,53 @@ def web_crawl():
return get_json_result(data=True)
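For reference, a hedged sketch of calling the `web_crawl` endpoint above from a client. It assumes the blueprint is mounted under `/v1/document` (consistent with the `/v1/document/image/...` thumbnail URLs used elsewhere in this file), that the server runs on its default address, and that `auth_headers` carries a valid login; adjust these to your deployment.

```python
# Hypothetical client call to /web_crawl; path prefix, port and auth are assumptions.
import requests

BASE_URL = "http://localhost:9380"                       # assumed API address
auth_headers = {"Authorization": "Bearer <your-token>"}  # placeholder credentials

resp = requests.post(
    f"{BASE_URL}/v1/document/web_crawl",
    headers=auth_headers,
    data={
        "kb_id": "<knowledgebase-id>",
        "name": "example-page",         # stored as example-page.pdf
        "url": "https://example.com/",  # must pass is_valid_url()
    },
    timeout=60,
)
print(resp.status_code, resp.json())
```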
@manager.route('/create', methods=['POST']) # noqa: F821
@manager.route("/create", methods=["POST"]) # noqa: F821
@login_required
@validate_request("name", "kb_id")
def create():
req = request.json
kb_id = req["kb_id"]
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
try:
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
return get_data_error_result(
message="Can't find this knowledgebase!")
return get_data_error_result(message="Can't find this knowledgebase!")
if DocumentService.query(name=req["name"], kb_id=kb_id):
return get_data_error_result(
message="Duplicated document name in the same knowledgebase.")
return get_data_error_result(message="Duplicated document name in the same knowledgebase.")
doc = DocumentService.insert({
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": FileType.VIRTUAL,
"name": req["name"],
"location": "",
"size": 0
})
doc = DocumentService.insert(
{
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": FileType.VIRTUAL,
"name": req["name"],
"location": "",
"size": 0,
}
)
return get_json_result(data=doc.to_json())
except Exception as e:
return server_error_response(e)
@manager.route('/list', methods=['GET']) # noqa: F821
@manager.route("/list", methods=["GET"]) # noqa: F821
@login_required
def list_docs():
kb_id = request.args.get("kb_id")
if not kb_id:
return get_json_result(
data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
tenants = UserTenantService.query(user_id=current_user.id)
for tenant in tenants:
if KnowledgebaseService.query(
tenant_id=tenant.tenant_id, id=kb_id):
if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id):
break
else:
return get_json_result(
data=False, message='Only owner of knowledgebase authorized for this operation.',
code=settings.RetCode.OPERATING_ERROR)
return get_json_result(data=False, message="Only owner of knowledgebase authorized for this operation.", code=settings.RetCode.OPERATING_ERROR)
keywords = request.args.get("keywords", "")
page_number = int(request.args.get("page", 1))
@ -206,70 +191,58 @@ def list_docs():
orderby = request.args.get("orderby", "create_time")
desc = request.args.get("desc", True)
try:
docs, tol = DocumentService.get_by_kb_id(
kb_id, page_number, items_per_page, orderby, desc, keywords)
docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords)
for doc_item in docs:
if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
doc_item['thumbnail'] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
return get_json_result(data={"total": tol, "docs": docs})
except Exception as e:
return server_error_response(e)
@manager.route('/infos', methods=['POST']) # noqa: F821
@manager.route("/infos", methods=["POST"]) # noqa: F821
@login_required
def docinfos():
req = request.json
doc_ids = req["doc_ids"]
for doc_id in doc_ids:
if not DocumentService.accessible(doc_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
docs = DocumentService.get_by_ids(doc_ids)
return get_json_result(data=list(docs.dicts()))
@manager.route('/thumbnails', methods=['GET']) # noqa: F821
@manager.route("/thumbnails", methods=["GET"]) # noqa: F821
# @login_required
def thumbnails():
doc_ids = request.args.get("doc_ids").split(",")
if not doc_ids:
return get_json_result(
data=False, message='Lack of "Document ID"', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Lack of "Document ID"', code=settings.RetCode.ARGUMENT_ERROR)
try:
docs = DocumentService.get_thumbnails(doc_ids)
for doc_item in docs:
if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
doc_item['thumbnail'] = f"/v1/document/image/{doc_item['kb_id']}-{doc_item['thumbnail']}"
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/v1/document/image/{doc_item['kb_id']}-{doc_item['thumbnail']}"
return get_json_result(data={d["id"]: d["thumbnail"] for d in docs})
except Exception as e:
return server_error_response(e)
@manager.route('/change_status', methods=['POST']) # noqa: F821
@manager.route("/change_status", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "status")
def change_status():
req = request.json
if str(req["status"]) not in ["0", "1"]:
return get_json_result(
data=False,
message='"Status" must be either 0 or 1!',
code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='"Status" must be either 0 or 1!', code=settings.RetCode.ARGUMENT_ERROR)
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
@ -277,23 +250,19 @@ def change_status():
return get_data_error_result(message="Document not found!")
e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
if not e:
return get_data_error_result(
message="Can't find this knowledgebase!")
return get_data_error_result(message="Can't find this knowledgebase!")
if not DocumentService.update_by_id(
req["doc_id"], {"status": str(req["status"])}):
return get_data_error_result(
message="Database error (Document update)!")
if not DocumentService.update_by_id(req["doc_id"], {"status": str(req["status"])}):
return get_data_error_result(message="Database error (Document update)!")
status = int(req["status"])
settings.docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status},
search.index_name(kb.tenant_id), doc.kb_id)
settings.docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
@manager.route('/rm', methods=['POST']) # noqa: F821
@manager.route("/rm", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id")
def rm():
@ -304,11 +273,7 @@ def rm():
for doc_id in doc_ids:
if not DocumentService.accessible4deletion(doc_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
root_folder = FileService.get_root_folder(current_user.id)
pf_id = root_folder["id"]
@ -327,8 +292,7 @@ def rm():
TaskService.filter_delete([Task.doc_id == doc_id])
if not DocumentService.remove_document(doc, tenant_id):
return get_data_error_result(
message="Database error (Document removal)!")
return get_data_error_result(message="Database error (Document removal)!")
f2d = File2DocumentService.get_by_document_id(doc_id)
FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
@ -344,20 +308,16 @@ def rm():
return get_json_result(data=True)
@manager.route('/run', methods=['POST']) # noqa: F821
@manager.route("/run", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_ids", "run")
def run():
req = request.json
# Check that the user is authorized to operate on these documents
for doc_id in req["doc_ids"]:
if not DocumentService.accessible(doc_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
# Iterate over all document IDs that need processing
for id in req["doc_ids"]:
@ -368,7 +328,7 @@ def run():
info["progress_msg"] = ""
info["chunk_num"] = 0
info["token_num"] = 0
# Update the document status
DocumentService.update_by_id(id, info)
# Get the tenant ID
@ -386,7 +346,7 @@ def run():
# If the index exists, delete the document's data from it
if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)
# If the requested status is RUNNING, create parsing tasks
if str(req["run"]) == TaskStatus.RUNNING.value:
e, doc = DocumentService.get_by_id(id)
@ -402,36 +362,25 @@ def run():
return server_error_response(e)
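The `run` endpoint resets a document's progress, removes its existing chunks from the document store, and queues new parsing tasks when the requested status is RUNNING. A hedged client sketch follows; the `/v1/document` prefix, the port, and the `"1"` value for `TaskStatus.RUNNING.value` are assumptions to verify against your deployment and `api/db`.

```python
# Hypothetical client call to trigger re-parsing via /run; prefix, port, auth
# and the "1" run value (assumed to be TaskStatus.RUNNING.value) are assumptions.
import requests

BASE_URL = "http://localhost:9380"                       # assumed API address
auth_headers = {"Authorization": "Bearer <your-token>"}  # placeholder credentials

resp = requests.post(
    f"{BASE_URL}/v1/document/run",
    headers=auth_headers,
    json={"doc_ids": ["<doc-id-1>", "<doc-id-2>"], "run": "1"},
    timeout=30,
)
print(resp.json())
```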
@manager.route('/rename', methods=['POST']) # noqa: F821
@manager.route("/rename", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "name")
def rename():
req = request.json
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
doc.name.lower()).suffix:
return get_json_result(
data=False,
message="The extension of file can't be changed",
code=settings.RetCode.ARGUMENT_ERROR)
if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
return get_json_result(data=False, message="The extension of file can't be changed", code=settings.RetCode.ARGUMENT_ERROR)
for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
if d.name == req["name"]:
return get_data_error_result(
message="Duplicated document name in the same knowledgebase.")
return get_data_error_result(message="Duplicated document name in the same knowledgebase.")
if not DocumentService.update_by_id(
req["doc_id"], {"name": req["name"]}):
return get_data_error_result(
message="Database error (Document rename)!")
if not DocumentService.update_by_id(req["doc_id"], {"name": req["name"]}):
return get_data_error_result(message="Database error (Document rename)!")
informs = File2DocumentService.get_by_document_id(req["doc_id"])
if informs:
@ -443,7 +392,7 @@ def rename():
return server_error_response(e)
@manager.route('/get/<doc_id>', methods=['GET']) # noqa: F821
@manager.route("/get/<doc_id>", methods=["GET"]) # noqa: F821
# @login_required
def get(doc_id):
try:
@ -457,29 +406,22 @@ def get(doc_id):
ext = re.search(r"\.([^.]+)$", doc.name)
if ext:
if doc.type == FileType.VISUAL.value:
response.headers.set('Content-Type', 'image/%s' % ext.group(1))
response.headers.set("Content-Type", "image/%s" % ext.group(1))
else:
response.headers.set(
'Content-Type',
'application/%s' %
ext.group(1))
response.headers.set("Content-Type", "application/%s" % ext.group(1))
return response
except Exception as e:
return server_error_response(e)
@manager.route('/change_parser', methods=['POST']) # noqa: F821
@manager.route("/change_parser", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "parser_id")
def change_parser():
req = request.json
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
@ -491,21 +433,16 @@ def change_parser():
else:
return get_json_result(data=True)
if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture")
or (re.search(
r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")):
if (doc.type == FileType.VISUAL and req["parser_id"] != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation"):
return get_data_error_result(message="Not supported yet!")
e = DocumentService.update_by_id(doc.id,
{"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
"run": TaskStatus.UNSTART.value})
e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": TaskStatus.UNSTART.value})
if not e:
return get_data_error_result(message="Document not found!")
if "parser_config" in req:
DocumentService.update_parser_config(doc.id, req["parser_config"])
if doc.token_num > 0:
e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
doc.process_duation * -1)
e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duation * -1)
if not e:
return get_data_error_result(message="Document not found!")
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
@ -519,7 +456,7 @@ def change_parser():
return server_error_response(e)
@manager.route('/image/<image_id>', methods=['GET']) # noqa: F821
@manager.route("/image/<image_id>", methods=["GET"]) # noqa: F821
# @login_required
def get_image(image_id):
try:
@ -528,60 +465,52 @@ def get_image(image_id):
return get_data_error_result(message="Image not found.")
bkt, nm = image_id.split("-")
response = flask.make_response(STORAGE_IMPL.get(bkt, nm))
response.headers.set('Content-Type', 'image/JPEG')
response.headers.set("Content-Type", "image/JPEG")
return response
except Exception as e:
return server_error_response(e)
@manager.route('/upload_and_parse', methods=['POST']) # noqa: F821
@manager.route("/upload_and_parse", methods=["POST"]) # noqa: F821
@login_required
@validate_request("conversation_id")
def upload_and_parse():
if 'file' not in request.files:
return get_json_result(
data=False, message='No file part!', code=settings.RetCode.ARGUMENT_ERROR)
if "file" not in request.files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)
file_objs = request.files.getlist('file')
file_objs = request.files.getlist("file")
for file_obj in file_objs:
if file_obj.filename == '':
return get_json_result(
data=False, message='No file selected!', code=settings.RetCode.ARGUMENT_ERROR)
if file_obj.filename == "":
return get_json_result(data=False, message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR)
doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)
return get_json_result(data=doc_ids)
@manager.route('/parse', methods=['POST']) # noqa: F821
@manager.route("/parse", methods=["POST"]) # noqa: F821
@login_required
def parse():
url = request.json.get("url") if request.json else ""
if url:
if not is_valid_url(url):
return get_json_result(
data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message="The URL format is invalid", code=settings.RetCode.ARGUMENT_ERROR)
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
os.makedirs(download_path, exist_ok=True)
from seleniumwire.webdriver import Chrome, ChromeOptions
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_experimental_option('prefs', {
'download.default_directory': download_path,
'download.prompt_for_download': False,
'download.directory_upgrade': True,
'safebrowsing.enabled': True
})
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_experimental_option("prefs", {"download.default_directory": download_path, "download.prompt_for_download": False, "download.directory_upgrade": True, "safebrowsing.enabled": True})
driver = Chrome(options=options)
driver.get(url)
res_headers = [r.response.headers for r in driver.requests if r and r.response]
if len(res_headers) > 1:
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
driver.quit()
return get_json_result(data="\n".join(sections))
return get_json_result(data="\n")
class File:
filename: str
@ -597,51 +526,41 @@ def parse():
r = re.search(r"filename=\"([^\"]+)\"", str(res_headers))
if not r or not r.group(1):
return get_json_result(
data=False, message="Can't not identify downloaded file", code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message="Can't not identify downloaded file", code=settings.RetCode.ARGUMENT_ERROR)
f = File(r.group(1), os.path.join(download_path, r.group(1)))
txt = FileService.parse_docs([f], current_user.id)
return get_json_result(data=txt)
if 'file' not in request.files:
return get_json_result(
data=False, message='No file part!', code=settings.RetCode.ARGUMENT_ERROR)
if "file" not in request.files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)
file_objs = request.files.getlist('file')
file_objs = request.files.getlist("file")
txt = FileService.parse_docs(file_objs, current_user.id)
return get_json_result(data=txt)
@manager.route('/set_meta', methods=['POST']) # noqa: F821
@manager.route("/set_meta", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "meta")
def set_meta():
req = request.json
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
meta = json.loads(req["meta"])
except Exception as e:
return get_json_result(
data=False, message=f'Json syntax error: {e}', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message=f"Json syntax error: {e}", code=settings.RetCode.ARGUMENT_ERROR)
if not isinstance(meta, dict):
return get_json_result(
data=False, message='Meta data should be in Json map format, like {"key": "value"}', code=settings.RetCode.ARGUMENT_ERROR)
return get_json_result(data=False, message='Meta data should be in Json map format, like {"key": "value"}', code=settings.RetCode.ARGUMENT_ERROR)
try:
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if not DocumentService.update_by_id(
req["doc_id"], {"meta_fields": meta}):
return get_data_error_result(
message="Database error (meta updates)!")
if not DocumentService.update_by_id(req["doc_id"], {"meta_fields": meta}):
return get_data_error_result(message="Database error (meta updates)!")
return get_json_result(data=True)
except Exception as e:

View File

@ -19,7 +19,6 @@ import xxhash
from datetime import datetime
from api.db.db_utils import bulk_insert_into_db
from deepdoc.parser import PdfParser
from peewee import JOIN
from api.db.db_models import DB, File2Document, File
from api.db import StatusEnum, FileType, TaskStatus
@ -27,7 +26,6 @@ from api.db.db_models import Task, Document, Knowledgebase, Tenant
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.utils import current_timestamp, get_uuid
from deepdoc.parser.excel_parser import RAGFlowExcelParser
from rag.settings import SVR_QUEUE_NAME
from rag.utils.storage_factory import STORAGE_IMPL
from rag.utils.redis_conn import REDIS_CONN
@ -40,8 +38,8 @@ def trim_header_by_lines(text: str, max_length) -> str:
if len_text <= max_length:
return text
for i in range(len_text):
if text[i] == '\n' and len_text - i <= max_length:
return text[i + 1:]
if text[i] == "\n" and len_text - i <= max_length:
return text[i + 1 :]
return text
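To make the trimming rule concrete: whole leading lines are dropped until what remains fits within `max_length`, and if no newline yields a short-enough tail the text is returned unchanged. A standalone sketch follows (the function is copied from above; the `len_text = len(text)` assignment is assumed from context, since it is elided from the hunk).

```python
def trim_header_by_lines(text: str, max_length) -> str:
    len_text = len(text)  # assumed assignment, elided from the hunk above
    if len_text <= max_length:
        return text
    for i in range(len_text):
        if text[i] == "\n" and len_text - i <= max_length:
            return text[i + 1 :]
    return text


text = "line1\nline2\nline3"                              # 17 characters
assert trim_header_by_lines(text, 50) == text             # already within the limit
assert trim_header_by_lines(text, 12) == "line2\nline3"   # first line dropped
assert trim_header_by_lines(text, 6) == "line3"           # only the last line fits
assert trim_header_by_lines(text, 4) == text              # no split point fits: unchanged
```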
@ -76,10 +74,10 @@ class TaskService(CommonService):
]
docs = (
cls.model.select(*fields)
.join(Document, on=(cls.model.doc_id == Document.id))
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id))
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
.where(cls.model.id == task_id)
.join(Document, on=(cls.model.doc_id == Document.id))
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id))
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
.where(cls.model.id == task_id)
)
docs = list(docs.dicts())
if not docs:
@ -112,10 +110,7 @@ class TaskService(CommonService):
cls.model.digest,
cls.model.chunk_ids,
]
tasks = (
cls.model.select(*fields).order_by(cls.model.from_page.asc(), cls.model.create_time.desc())
.where(cls.model.doc_id == doc_id)
)
tasks = cls.model.select(*fields).order_by(cls.model.from_page.asc(), cls.model.create_time.desc()).where(cls.model.doc_id == doc_id)
tasks = list(tasks.dicts())
if not tasks:
return None
@ -131,21 +126,19 @@ class TaskService(CommonService):
def get_ongoing_doc_name(cls):
with DB.lock("get_task", -1):
docs = (
cls.model.select(
*[Document.id, Document.kb_id, Document.location, File.parent_id]
)
.join(Document, on=(cls.model.doc_id == Document.id))
.join(
cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id])
.join(Document, on=(cls.model.doc_id == Document.id))
.join(
File2Document,
on=(File2Document.document_id == Document.id),
join_type=JOIN.LEFT_OUTER,
)
.join(
.join(
File,
on=(File2Document.file_id == File.id),
join_type=JOIN.LEFT_OUTER,
)
.where(
.where(
Document.status == StatusEnum.VALID.value,
Document.run == TaskStatus.RUNNING.value,
~(Document.type == FileType.VIRTUAL.value),
@ -185,9 +178,7 @@ class TaskService(CommonService):
progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 3000)
cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
if "progress" in info:
cls.model.update(progress=info["progress"]).where(
cls.model.id == id
).execute()
cls.model.update(progress=info["progress"]).where(cls.model.id == id).execute()
return
with DB.lock("update_progress", -1):
@ -196,23 +187,21 @@ class TaskService(CommonService):
progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 3000)
cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
if "progress" in info:
cls.model.update(progress=info["progress"]).where(
cls.model.id == id
).execute()
cls.model.update(progress=info["progress"]).where(cls.model.id == id).execute()
def queue_tasks(doc: dict, bucket: str, name: str):
"""
Split a document-parsing job into tasks and queue them for processing.
Based on the document type (PDF, table, etc.), the function splits the document into multiple sub-tasks, computes a digest for each task,
checks whether results from previous tasks can be reused, and pushes the unfinished tasks onto the Redis queue for processing.
Args:
doc (dict): document info, including id, type, parser_id, parser_config, etc.
bucket (str): storage bucket name
name (str): file name
Flow:
1. Split the document into sub-tasks according to its type (PDF/table)
2. Generate a unique digest for each task
@ -221,10 +210,11 @@ def queue_tasks(doc: dict, bucket: str, name: str):
5. Bulk-insert the new tasks into the database
6. Push the unfinished tasks onto the Redis queue
"""
def new_task():
"""
Create a new task dict containing the basic task information.
Returns:
dict: a task dict containing the task ID, document ID, progress and page range
"""
@ -240,7 +230,7 @@ def queue_tasks(doc: dict, bucket: str, name: str):
# Layout recognition method, defaulting to "DeepDOC"
do_layout = doc["parser_config"].get("layout_recognize", "DeepDOC")
# Total number of pages in the PDF
pages = PdfParser.total_page_number(doc["name"], file_bin)
pages = 1
# Number of pages handled per task, 12 by default
page_size = doc["parser_config"].get("task_page_size", 12)
# For the academic-paper parser, the default is 22 pages per task
@ -248,9 +238,9 @@ def queue_tasks(doc: dict, bucket: str, name: str):
page_size = doc["parser_config"].get("task_page_size", 22)
# For certain parsers, or when layout recognition is not DeepDOC, treat the whole document as a single task
if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC":
page_size = 10 ** 9
page_size = 10**9
# Page ranges to process, defaulting to all pages
page_ranges = doc["parser_config"].get("pages") or [(1, 10 ** 5)]
page_ranges = doc["parser_config"].get("pages") or [(1, 10**5)]
# Split tasks according to the page ranges and page size
for s, e in page_ranges:
# Adjust page numbers to be zero-based
@ -269,14 +259,6 @@ def queue_tasks(doc: dict, bucket: str, name: str):
elif doc["parser_id"] == "table":
# Fetch the file content from storage
file_bin = STORAGE_IMPL.get(bucket, name)
# Get the total number of rows in the spreadsheet
rn = RAGFlowExcelParser.row_number(doc["name"], file_bin)
# Every 3000 rows becomes one task
for i in range(0, rn, 3000):
task = new_task()
task["from_page"] = i
task["to_page"] = min(i + 3000, rn)
parse_task_array.append(task)
# Other document types: treat the whole document as a single task
else:
parse_task_array.append(new_task())
@ -321,8 +303,7 @@ def queue_tasks(doc: dict, bucket: str, name: str):
chunk_ids.extend(task["chunk_ids"].split())
# Delete these chunks from the document store
if chunk_ids:
settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]),
chunking_config["kb_id"])
settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
# Update the document's chunk count
DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})
@ -335,17 +316,14 @@ def queue_tasks(doc: dict, bucket: str, name: str):
unfinished_task_array = [task for task in parse_task_array if task["progress"] < 1.0]
# Push the unfinished tasks onto the Redis queue
for unfinished_task in unfinished_task_array:
assert REDIS_CONN.queue_product(
SVR_QUEUE_NAME, message=unfinished_task
), "Can't access Redis. Please check the Redis' status."
assert REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=unfinished_task), "Can't access Redis. Please check the Redis' status."
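As a concrete illustration of the page splitting described in the docstring, the sketch below reproduces the arithmetic in isolation: each configured page range (1-based, inclusive) is cut into windows of `page_size` pages, giving zero-based `from_page`/`to_page` bounds. The helper name and the exact clamping are assumptions for illustration only; the body of the splitting loop is elided from the hunks above.

```python
# Hypothetical standalone sketch of the page-range splitting; not the service code.
def split_page_ranges(page_ranges, page_size, total_pages):
    tasks = []
    for s, e in page_ranges:
        s = max(0, s - 1)            # adjust page numbers to be zero-based
        e = min(e - 1, total_pages)  # clamp to the document length
        for p in range(s, e, page_size):
            tasks.append({"from_page": p, "to_page": min(p + page_size, e)})
    return tasks


print(split_page_ranges([(1, 10**5)], page_size=12, total_pages=30))
# [{'from_page': 0, 'to_page': 12}, {'from_page': 12, 'to_page': 24}, {'from_page': 24, 'to_page': 30}]
```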
def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: dict):
idx = 0
while idx < len(prev_tasks):
prev_task = prev_tasks[idx]
if prev_task.get("from_page", 0) == task.get("from_page", 0) \
and prev_task.get("digest", 0) == task.get("digest", ""):
if prev_task.get("from_page", 0) == task.get("from_page", 0) and prev_task.get("digest", 0) == task.get("digest", ""):
break
idx += 1
@ -356,12 +334,11 @@ def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config:
return 0
task["chunk_ids"] = prev_task["chunk_ids"]
task["progress"] = 1.0
if "from_page" in task and "to_page" in task and int(task['to_page']) - int(task['from_page']) >= 10 ** 6:
if "from_page" in task and "to_page" in task and int(task["to_page"]) - int(task["from_page"]) >= 10**6:
task["progress_msg"] = f"Page({task['from_page']}~{task['to_page']}): "
else:
task["progress_msg"] = ""
task["progress_msg"] = " ".join(
[datetime.now().strftime("%H:%M:%S"), task["progress_msg"], "Reused previous task's chunks."])
task["progress_msg"] = " ".join([datetime.now().strftime("%H:%M:%S"), task["progress_msg"], "Reused previous task's chunks."])
prev_task["chunk_ids"] = ""
return len(task["chunk_ids"].split())
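Chunk reuse above hinges on matching `from_page` and the task `digest`. How the digest is computed is not visible in this diff; purely as an illustration, a stable digest could be derived with `xxhash` (which this module imports) over a task's defining fields, as sketched below. The field selection is an assumption, not the project's actual scheme.

```python
# Hypothetical digest over a task's defining fields; illustration only.
import json

import xxhash


def task_digest(doc_id: str, from_page: int, to_page: int, parser_config: dict) -> str:
    payload = json.dumps(
        {"doc_id": doc_id, "from_page": from_page, "to_page": to_page, "parser_config": parser_config},
        sort_keys=True,
        ensure_ascii=False,
    )
    return xxhash.xxh64(payload.encode("utf-8")).hexdigest()


print(task_digest("doc-123", 0, 12, {"layout_recognize": "DeepDOC"}))
```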

View File

@ -1,122 +0,0 @@
English | [简体中文](./README_zh.md)
# *Deep*Doc
- [1. Introduction](#1)
- [2. Vision](#2)
- [3. Parser](#3)
<a name="1"></a>
## 1. Introduction
With documents coming from various domains, in various formats, and with diverse retrieval requirements,
accurate analysis becomes a very challenging task. *Deep*Doc was born for that purpose.
There are two parts in *Deep*Doc so far: vision and parser.
You can run the following test programs if you're interested in our results for OCR, layout recognition and TSR.
```bash
python deepdoc/vision/t_ocr.py -h
usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR]
options:
-h, --help show this help message and exit
--inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
--output_dir OUTPUT_DIR
Directory where to store the output images. Default: './ocr_outputs'
```
```bash
python deepdoc/vision/t_recognizer.py -h
usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}]
options:
-h, --help show this help message and exit
--inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
--output_dir OUTPUT_DIR
Directory where to store the output images. Default: './layouts_outputs'
--threshold THRESHOLD
A threshold to filter out detections. Default: 0.5
--mode {layout,tsr} Task mode: layout recognition or table structure recognition
```
Our models are served on HuggingFace. If you have trouble downloading HuggingFace models, this might help!!
```bash
export HF_ENDPOINT=https://hf-mirror.com
```
<a name="2"></a>
## 2. Vision
We use vision information to resolve problems as human beings do.
- OCR. Since a lot of documents are presented as images, or can at least be converted to images,
OCR is an essential, fundamental, even universal solution for text extraction.
```bash
python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result
```
The input can be a directory of images or PDFs, or a single image or PDF.
You can look into the folder 'path_to_store_result', which contains images that mark the positions of the results,
along with txt files that contain the OCR text.
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/>
</div>
- Layout recognition. Documents from different domains may have various layouts;
newspapers, magazines, books and résumés, for example, are all distinct in terms of layout.
Only with an accurate layout analysis can a machine decide whether text parts are successive,
whether a region needs Table Structure Recognition (TSR), or whether a region is a figure described by its caption.
We have 10 basic layout components, which cover most cases:
- Text
- Title
- Figure
- Figure caption
- Table
- Table caption
- Header
- Footer
- Reference
- Equation
Try the following command to see the layout detection results.
```bash
python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result
```
The input can be a directory of images or PDFs, or a single image or PDF.
You can look into the folder 'path_to_store_result', which contains images that show the detection results, as follows:
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/>
</div>
- Table Structure Recognition (TSR). Data tables are a frequently used structure to present data, including numbers or text.
The structure of a table might be very complex, with hierarchical headers, spanning cells and projected row headers.
Along with TSR, we also reassemble the content into sentences that can be well comprehended by an LLM.
We have five labels for the TSR task:
- Column
- Row
- Column header
- Projected row header
- Spanning cell
Try the following command to see the detection results.
```bash
python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result
```
The input can be a directory of images or PDFs, or a single image or PDF.
You can look into the folder 'path_to_store_result', which contains both images and HTML pages that show the detection results, as follows:
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
</div>
<a name="3"></a>
## 3. Parser
Four document formats, PDF, DOCX, EXCEL and PPT, have their corresponding parsers.
The most complex one is the PDF parser, owing to PDF's flexibility. The output of the PDF parser includes:
- Text chunks with their own positions in the PDF (page number and rectangular coordinates).
- Tables with the image cropped from the PDF, plus contents that have already been translated into natural language sentences.
- Figures with their captions and the text inside the figures.
### Résumé
The résumé is a very complicated kind of document. A résumé, which is composed of unstructured text
with various layouts, can be resolved into structured data composed of nearly a hundred fields.
We haven't open-sourced the parser itself yet; only the processing that follows the parsing procedure is released.

View File

@ -1,116 +0,0 @@
[English](./README.md) | 简体中文
# *Deep*Doc
- [*Deep*Doc](#deepdoc)
- [1. Introduction](#1-介绍)
- [2. Vision](#2-视觉处理)
- [3. Parser](#3-解析器)
- [Résumé](#简历)
<a name="1"></a>
## 1. Introduction
For the large numbers of documents that come from different domains, in different formats and with different retrieval requirements, accurate analysis is an extremely challenging task. *Deep*Doc was born for exactly this purpose. So far, *Deep*Doc has two components: vision and parser. If you are interested in our OCR, layout recognition and TSR results, you can run the test programs below.
```bash
python deepdoc/vision/t_ocr.py -h
usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR]
options:
-h, --help show this help message and exit
--inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
--output_dir OUTPUT_DIR
Directory where to store the output images. Default: './ocr_outputs'
```
```bash
python deepdoc/vision/t_recognizer.py -h
usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}]
options:
-h, --help show this help message and exit
--inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
--output_dir OUTPUT_DIR
Directory where to store the output images. Default: './layouts_outputs'
--threshold THRESHOLD
A threshold to filter out detections. Default: 0.5
--mode {layout,tsr} Task mode: layout recognition or table structure recognition
```
Our models are served on HuggingFace. If you run into problems downloading HuggingFace models, this might help!
```bash
export HF_ENDPOINT=https://hf-mirror.com
```
<a name="2"></a>
## 2. Vision
As humans, we use visual information to solve problems.
- **OCR (Optical Character Recognition)**. Since many documents are presented as images, or can at least be converted to images, OCR is a very important, fundamental, even universal solution for text extraction.
```bash
python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result
```
The input can be a directory of images or PDFs, or a single image or PDF file. You can look into the folder `path_to_store_result`, which contains images marking the positions of the results, along with txt files containing the OCR text.
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/>
</div>
- Layout recognition. Documents from different domains may have different layouts; newspapers, magazines, books and résumés, for instance, all differ in layout. Only with an accurate layout analysis can a machine decide whether text parts are continuous or not, whether a region needs Table Structure Recognition (TSR), or whether a region is a figure described by its caption. We have 10 basic layout components covering most cases:
- Text
- Title
- Figure
- Figure caption
- Table
- Table caption
- Header
- Footer
- Reference
- Equation
Try the following command to see the layout detection results.
```bash
python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result
```
The input can be a directory of images or PDFs, or a single image or PDF file. You can look into the folder `path_to_store_result`, which contains images showing the detection results, as follows:
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/>
</div>
- **TSR (Table Structure Recognition)**. Data tables are a commonly used structure for presenting data, whether numbers or text. The structure of a table can be very complex, with hierarchical headers, spanning cells and projected row headers. Along with TSR, we also reassemble the content into sentences that an LLM can understand well. The TSR task has five labels:
- Column
- Row
- Column header
- Row header
- Spanning (merged) cell
Try the following command to see the detection results.
```bash
python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result
```
The input can be a directory of images or PDFs, or a single image or PDF file. You can look into the folder `path_to_store_result`, which contains both images and HTML pages showing the detection results, as follows:
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
</div>
<a name="3"></a>
## 3. Parser
All four document formats, PDF, DOCX, EXCEL and PPT, have corresponding parsers. The most complex one is the PDF parser, because of PDF's flexibility. The output of the PDF parser includes:
- Text chunks with their own positions in the PDF (page number and rectangular coordinates).
- Tables with the image cropped from the PDF, plus contents that have already been translated into natural language sentences.
- Figures with their captions and the text inside the figures.
### Résumé
A résumé is a very complicated kind of document. A résumé composed of unstructured text in various layouts can be parsed into structured data with nearly a hundred fields. We haven't open-sourced the parser yet; only the processing that follows the parsing procedure is released.

View File

@ -1,18 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from beartype.claw import beartype_this_package
beartype_this_package()

View File

@ -1,36 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
from .docx_parser import RAGFlowDocxParser as DocxParser
from .excel_parser import RAGFlowExcelParser as ExcelParser
from .ppt_parser import RAGFlowPptParser as PptParser
from .html_parser import RAGFlowHtmlParser as HtmlParser
from .json_parser import RAGFlowJsonParser as JsonParser
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
from .txt_parser import RAGFlowTxtParser as TxtParser
__all__ = [
"PdfParser",
"PlainParser",
"DocxParser",
"ExcelParser",
"PptParser",
"HtmlParser",
"JsonParser",
"MarkdownParser",
"TxtParser",
]

View File

@ -1,227 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from docx import Document
import re
import pandas as pd
from collections import Counter
from rag.nlp import rag_tokenizer
from io import BytesIO
class RAGFlowDocxParser:
"""
Parser for Word documents (.docx): extracts the text content and tables from a document.
The parser can:
1. Extract paragraph text and its style within a given page range
2. Recognize tables in the document and convert them into structured text
3. Handle table headers and contents intelligently, producing semantic text descriptions
"""
def __extract_table_content(self, tb):
"""
Extract the content of a Word table object and convert it to a DataFrame.
Args:
tb: a Table object from the docx library
Returns:
A list of text lines representing the processed table content
"""
df = []
for row in tb.rows:
df.append([c.text for c in row.cells])
return self.__compose_table_content(pd.DataFrame(df))
def __compose_table_content(self, df):
"""
Convert a table DataFrame into a semantic text description.
By recognizing structural features of the table (headers, data types, etc.), the table is turned into a more readable text form.
Args:
df: DataFrame holding the table content
Returns:
A list of text representations of the table content
"""
def blockType(b):
"""
Identify the content type of a cell.
Using regular expressions and text-feature analysis, cell content is classified into:
- Dt: date
- Nu: number
- Ca: code/ID
- En: English text
- NE: mixed numbers and text
- Sg: single character
- Tx: short text
- Lx: long text
- Nr: person name
- Ot: other
Args:
b: the cell's text content
Returns:
A string identifier for the content type
"""
patt = [
("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
(r"^(20|19)[0-9]{2}年$", "Dt"),
(r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
(r"^第*[一二三四1-4]季度$", "Dt"),
(r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
(r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
("^[0-9.,+%/ -]+$", "Nu"),
(r"^[0-9A-Z/\._~-]+$", "Ca"),
(r"^[A-Z]*[a-z' -]+$", "En"),
(r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()' -]+$", "NE"),
(r"^.{1}$", "Sg")
]
for p, n in patt:
if re.search(p, b):
return n
tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
if len(tks) > 3:
if len(tks) < 12:
return "Tx"
else:
return "Lx"
if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
return "Nr"
return "Ot"
# A table needs at least two rows to be processed
if len(df) < 2:
return []
# Count the most common content type across the table cells
max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
1, len(df)) for j in range(len(df.iloc[i, :]))])
max_type = max(max_type.items(), key=lambda x: x[1])[0]
# Number of columns in the table
colnm = len(df.iloc[0, :])
# The first row is treated as a header row by default
hdrows = [0] # the header does not necessarily appear in the first row
# If the table is mostly numeric, treat the non-numeric rows as header rows
if max_type == "Nu":
for r in range(1, len(df)):
tys = Counter([blockType(str(df.iloc[r, j]))
for j in range(len(df.iloc[r, :]))])
tys = max(tys.items(), key=lambda x: x[1])[0]
if tys != max_type:
hdrows.append(r)
# Process the table content, converting each row into text
lines = []
for i in range(1, len(df)):
# Skip header rows
if i in hdrows:
continue
# Compute the header rows that precede the current row
hr = [r - i for r in hdrows]
hr = [r for r in hr if r < 0]
# Find the nearest run of consecutive header rows
t = len(hr) - 1
while t > 0:
if hr[t] - hr[t - 1] > 1:
hr = hr[t:]
break
t -= 1
# Build a header description for each column
headers = []
for j in range(len(df.iloc[i, :])):
t = []
for h in hr:
x = str(df.iloc[i + h, j]).strip()
if x in t:
continue
t.append(x)
t = ",".join(t)
if t:
t += ": "
headers.append(t)
# Build the text representation of each row
cells = []
for j in range(len(df.iloc[i, :])):
if not str(df.iloc[i, j]):
continue
cells.append(headers[j] + str(df.iloc[i, j]))
lines.append(";".join(cells))
# Decide the return format based on the number of columns
if colnm > 3:
return lines
return ["\n".join(lines)]
def __call__(self, fnm, from_page=0, to_page=100000000):
"""
Parse a Word document and extract the text and tables within the given page range.
Args:
fnm: file name or binary content
from_page: starting page number (0-based)
to_page: ending page number
Returns:
A tuple (secs, tbls) where:
- secs: list of paragraph contents, each item a (text, style name) tuple
- tbls: list of table contents
"""
# Create the Document object according to the input type
self.doc = Document(fnm) if isinstance(
fnm, str) else Document(BytesIO(fnm))
pn = 0 # current page number while parsing
secs = [] # parsed paragraph contents
# Iterate over all paragraphs in the document
for p in self.doc.paragraphs:
# Stop parsing once past the given page range
if pn > to_page:
break
runs_within_single_paragraph = [] # text runs that fall within the page range
# Iterate over all text runs in the paragraph
for run in p.runs:
if pn > to_page:
break
# If the current page is within range and the paragraph has content, collect the text
if from_page <= pn < to_page and p.text.strip():
runs_within_single_paragraph.append(run.text) # collect the text run first
# Check for a page break marker
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
# Append the paragraph text and its style to the result list
secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then join the runs into the paragraph text
# Extract the content of all tables
tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
return secs, tbls
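A hedged usage sketch of the parser above; it assumes the `RAGFlowDocxParser` class is in scope (it used to live in `deepdoc/parser/docx_parser.py`) and that a local `sample.docx` exists.

```python
# Hypothetical usage; sample.docx is a placeholder path.
parser = RAGFlowDocxParser()
sections, tables = parser("sample.docx")  # a bytes object works as well

for text, style in sections[:5]:
    print(f"[{style}] {text}")

for table_lines in tables:
    print("TABLE:", table_lines)
```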

View File

@ -1,150 +0,0 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import sys
from io import BytesIO
import pandas as pd
from openpyxl import Workbook, load_workbook
from rag.nlp import find_codec
class RAGFlowExcelParser:
@staticmethod
def _load_excel_to_workbook(file_like_object):
if isinstance(file_like_object, bytes):
file_like_object = BytesIO(file_like_object)
# Read first 4 bytes to determine file type
file_like_object.seek(0)
file_head = file_like_object.read(4)
file_like_object.seek(0)
if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')):
logging.info("****wxy: Not an Excel file, converting CSV to Excel Workbook")
try:
file_like_object.seek(0)
df = pd.read_csv(file_like_object)
return RAGFlowExcelParser._dataframe_to_workbook(df)
except Exception as e_csv:
raise Exception(f"****wxy: Failed to parse CSV and convert to Excel Workbook: {e_csv}")
try:
return load_workbook(file_like_object)
except Exception as e:
logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead")
try:
file_like_object.seek(0)
df = pd.read_excel(file_like_object)
return RAGFlowExcelParser._dataframe_to_workbook(df)
except Exception as e_pandas:
raise Exception(f"****wxy: pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
@staticmethod
def _dataframe_to_workbook(df):
wb = Workbook()
ws = wb.active
ws.title = "Data"
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
return wb
def html(self, fnm, chunk_rows=256):
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
tb_chunks = []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:
continue
tb_rows_0 = "<tr>"
for t in list(rows[0]):
tb_rows_0 += f"<th>{t.value}</th>"
tb_rows_0 += "</tr>"
for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
tb = ""
tb += f"<table><caption>{sheetname}</caption>"
tb += tb_rows_0
for r in list(
rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows]
):
tb += "<tr>"
for i, c in enumerate(r):
if c.value is None:
tb += "<td></td>"
else:
tb += f"<td>{c.value}</td>"
tb += "</tr>"
tb += "</table>\n"
tb_chunks.append(tb)
return tb_chunks
def __call__(self, fnm):
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
res = []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:
continue
ti = list(rows[0])
for r in list(rows[1:]):
fields = []
for i, c in enumerate(r):
if not c.value:
continue
t = str(ti[i].value) if i < len(ti) else ""
t += ("" if t else "") + str(c.value)
fields.append(t)
line = "; ".join(fields)
if sheetname.lower().find("sheet") < 0:
line += " ——" + sheetname
res.append(line)
return res
@staticmethod
def row_number(fnm, binary):
if fnm.split(".")[-1].lower().find("xls") >= 0:
wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
total = 0
for sheetname in wb.sheetnames:
ws = wb[sheetname]
total += len(list(ws.rows))
return total
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
return len(txt.split("\n"))
if __name__ == "__main__":
psr = RAGFlowExcelParser()
psr(sys.argv[1])
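Beyond the `__call__` usage in the `__main__` block above, the parser can also emit HTML table chunks and count rows. A hedged sketch, assuming the class above is in scope and a local `sample.xlsx` exists:

```python
# Hypothetical usage; sample.xlsx is a placeholder path.
with open("sample.xlsx", "rb") as f:
    binary = f.read()

parser = RAGFlowExcelParser()

# One "<table>...</table>" string per chunk of up to 256 data rows per sheet.
for chunk in parser.html(binary, chunk_rows=256):
    print(chunk[:120], "...")

# Row count, formerly used by queue_tasks() to split spreadsheet parsing tasks.
print(RAGFlowExcelParser.row_number("sample.xlsx", binary))
```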

View File

@ -1,50 +0,0 @@
# -*- coding: utf-8 -*-
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from rag.nlp import find_codec
import readability
import html_text
import chardet
def get_encoding(file):
with open(file,'rb') as f:
tmp = chardet.detect(f.read())
return tmp['encoding']
class RAGFlowHtmlParser:
def __call__(self, fnm, binary=None):
txt = ""
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r",encoding=get_encoding(fnm)) as f:
txt = f.read()
return self.parser_txt(txt)
@classmethod
def parser_txt(cls, txt):
if not isinstance(txt, str):
raise TypeError("txt type should be str!")
html_doc = readability.Document(txt)
title = html_doc.title()
content = html_text.extract_text(html_doc.summary(html_partial=True))
txt = f"{title}\n{content}"
sections = txt.split("\n")
return sections
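A hedged usage sketch of `parser_txt` on a raw HTML string; it assumes the class above is in scope and that the `readability` and `html_text` dependencies it imports are installed.

```python
# Hypothetical usage of the classmethod parser_txt.
html = """
<html><head><title>Quarterly report</title></head>
<body><h1>Summary</h1><p>Revenue grew in Q2.</p></body></html>
"""

for line in RAGFlowHtmlParser.parser_txt(html):  # title followed by extracted text, split on newlines
    print(line)
```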

View File

@ -1,133 +0,0 @@
# -*- coding: utf-8 -*-
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The following documents are mainly referenced, and only adaptation modifications have been made
# from https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/json.py
import json
from typing import Any
from rag.nlp import find_codec
class RAGFlowJsonParser:
def __init__(
self, max_chunk_size: int = 2000, min_chunk_size: int | None = None
):
super().__init__()
self.max_chunk_size = max_chunk_size * 2
self.min_chunk_size = (
min_chunk_size
if min_chunk_size is not None
else max(max_chunk_size - 200, 50)
)
def __call__(self, binary):
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
json_data = json.loads(txt)
chunks = self.split_json(json_data, True)
sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
return sections
@staticmethod
def _json_size(data: dict) -> int:
"""Calculate the size of the serialized JSON object."""
return len(json.dumps(data, ensure_ascii=False))
@staticmethod
def _set_nested_dict(d: dict, path: list[str], value: Any) -> None:
"""Set a value in a nested dictionary based on the given path."""
for key in path[:-1]:
d = d.setdefault(key, {})
d[path[-1]] = value
def _list_to_dict_preprocessing(self, data: Any) -> Any:
if isinstance(data, dict):
# Process each key-value pair in the dictionary
return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
elif isinstance(data, list):
# Convert the list to a dictionary with index-based keys
return {
str(i): self._list_to_dict_preprocessing(item)
for i, item in enumerate(data)
}
else:
# Base case: the item is neither a dict nor a list, so return it unchanged
return data
def _json_split(
self,
data,
current_path: list[str] | None,
chunks: list[dict] | None,
) -> list[dict]:
"""
        Split JSON into dictionaries no larger than max_chunk_size while preserving structure.
"""
current_path = current_path or []
chunks = chunks or [{}]
if isinstance(data, dict):
for key, value in data.items():
new_path = current_path + [key]
chunk_size = self._json_size(chunks[-1])
size = self._json_size({key: value})
remaining = self.max_chunk_size - chunk_size
if size < remaining:
# Add item to current chunk
self._set_nested_dict(chunks[-1], new_path, value)
else:
if chunk_size >= self.min_chunk_size:
# Chunk is big enough, start a new chunk
chunks.append({})
# Iterate
self._json_split(value, new_path, chunks)
else:
# handle single item
self._set_nested_dict(chunks[-1], current_path, data)
return chunks
def split_json(
self,
json_data,
convert_lists: bool = False,
) -> list[dict]:
"""Splits JSON into a list of JSON chunks"""
if convert_lists:
preprocessed_data = self._list_to_dict_preprocessing(json_data)
chunks = self._json_split(preprocessed_data, None, None)
else:
chunks = self._json_split(json_data, None, None)
# Remove the last chunk if it's empty
if not chunks[-1]:
chunks.pop()
return chunks
def split_text(
self,
json_data: dict[str, Any],
convert_lists: bool = False,
ensure_ascii: bool = True,
) -> list[str]:
"""Splits JSON into a list of JSON formatted strings"""
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
# Convert to string
return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]
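
A minimal sketch of how the splitter above can be driven, using an invented payload; the chunk size below is illustrative only.

if __name__ == "__main__":
    parser = RAGFlowJsonParser(max_chunk_size=200)
    payload = {
        "title": "demo",
        "items": [{"id": i, "text": "x" * 50} for i in range(20)],
    }
    # __call__ expects raw bytes: it decodes, converts lists to index-keyed
    # dicts, splits by serialized size, and returns one JSON string per chunk.
    for section in parser(json.dumps(payload).encode("utf-8")):
        print(len(section), section[:60])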

View File

@ -1,77 +0,0 @@
# -*- coding: utf-8 -*-
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
class RAGFlowMarkdownParser:
def __init__(self, chunk_token_num=128):
self.chunk_token_num = int(chunk_token_num)
def extract_tables_and_remainder(self, markdown_text):
tables = []
remainder = markdown_text
if "|" in markdown_text: # for optimize performance
# Standard Markdown table
border_table_pattern = re.compile(
r'''
(?:\n|^)
(?:\|.*?\|.*?\|.*?\n)
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+
''', re.VERBOSE)
border_tables = border_table_pattern.findall(markdown_text)
tables.extend(border_tables)
remainder = border_table_pattern.sub('', remainder)
# Borderless Markdown table
no_border_table_pattern = re.compile(
r'''
(?:\n|^)
(?:\S.*?\|.*?\n)
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+
''', re.VERBOSE)
no_border_tables = no_border_table_pattern.findall(remainder)
tables.extend(no_border_tables)
remainder = no_border_table_pattern.sub('', remainder)
if "<table>" in remainder.lower(): # for optimize performance
#HTML table extraction - handle possible html/body wrapper tags
html_table_pattern = re.compile(
r'''
(?:\n|^)
\s*
(?:
# case1: <html><body><table>...</table></body></html>
(?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
|
# case2: <body><table>...</table></body>
(?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
|
                    # case3: only <table>...</table>
(?:<table[^>]*>.*?</table>)
)
\s*
(?=\n|$)
''',
re.VERBOSE | re.DOTALL | re.IGNORECASE
)
html_tables = html_table_pattern.findall(remainder)
tables.extend(html_tables)
remainder = html_table_pattern.sub('', remainder)
return remainder, tables
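
A minimal sketch of the table extractor above on an invented markdown snippet; it returns the prose remainder and the captured table blocks separately.

if __name__ == "__main__":
    md = (
        "Intro paragraph.\n"
        "\n"
        "| name | score |\n"
        "| --- | --- |\n"
        "| a | 1 |\n"
        "| b | 2 |\n"
        "\n"
        "Closing paragraph.\n"
    )
    remainder, tables = RAGFlowMarkdownParser().extract_tables_and_remainder(md)
    print(len(tables))  # 1: the bordered table above
    print(remainder)    # the two paragraphs with the table removed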

File diff suppressed because it is too large

View File

@ -1,68 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from io import BytesIO
from pptx import Presentation
class RAGFlowPptParser:
def __init__(self):
super().__init__()
def __extract(self, shape):
        if shape.shape_type == 19:  # 19 == MSO_SHAPE_TYPE.TABLE
tb = shape.table
rows = []
for i in range(1, len(tb.rows)):
rows.append("; ".join([tb.cell(
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
return "\n".join(rows)
if shape.has_text_frame:
return shape.text_frame.text
        if shape.shape_type == 6:  # 6 == MSO_SHAPE_TYPE.GROUP
texts = []
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
t = self.__extract(p)
if t:
texts.append(t)
return "\n".join(texts)
def __call__(self, fnm, from_page, to_page, callback=None):
ppt = Presentation(fnm) if isinstance(
fnm, str) else Presentation(
BytesIO(fnm))
txts = []
self.total_page = len(ppt.slides)
for i, slide in enumerate(ppt.slides):
if i < from_page:
continue
if i >= to_page:
break
texts = []
for shape in sorted(
slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
try:
txt = self.__extract(shape)
if txt:
texts.append(txt)
except Exception as e:
logging.exception(e)
txts.append("\n".join(texts))
return txts
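
A minimal sketch of the slide parser above; deck.pptx is a hypothetical path, and the page bounds are 0-based slide indices.

if __name__ == "__main__":
    parser = RAGFlowPptParser()
    # Extract text (tables, text frames, grouped shapes) from slides 0..2.
    for i, slide_text in enumerate(parser("deck.pptx", from_page=0, to_page=3)):
        print(f"--- slide {i} ---")
        print(slide_text)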

View File

@ -1,109 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
def refactor(cv):
for n in [
"raw_txt",
"parser_name",
"inference",
"ori_text",
"use_time",
"time_stat",
]:
if n in cv and cv[n] is not None:
del cv[n]
cv["is_deleted"] = 0
if "basic" not in cv:
cv["basic"] = {}
if cv["basic"].get("photo2"):
del cv["basic"]["photo2"]
for n in [
"education",
"work",
"certificate",
"project",
"language",
"skill",
"training",
]:
if n not in cv or cv[n] is None:
continue
if isinstance(cv[n], dict):
cv[n] = [v for _, v in cv[n].items()]
if not isinstance(cv[n], list):
del cv[n]
continue
vv = []
for v in cv[n]:
if "external" in v and v["external"] is not None:
del v["external"]
vv.append(v)
cv[n] = {str(i): vv[i] for i in range(len(vv))}
basics = [
("basic_salary_month", "salary_month"),
("expect_annual_salary_from", "expect_annual_salary"),
]
for n, t in basics:
if cv["basic"].get(n):
cv["basic"][t] = cv["basic"][n]
del cv["basic"][n]
work = sorted(
[v for _, v in cv.get("work", {}).items()],
key=lambda x: x.get("start_time", ""),
)
edu = sorted(
[v for _, v in cv.get("education", {}).items()],
key=lambda x: x.get("start_time", ""),
)
if work:
cv["basic"]["work_start_time"] = work[0].get("start_time", "")
cv["basic"]["management_experience"] = (
"Y"
if any([w.get("management_experience", "") == "Y" for w in work])
else "N"
)
cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0")
for n in [
"annual_salary_from",
"annual_salary_to",
"industry_name",
"position_name",
"responsibilities",
"corporation_type",
"scale",
"corporation_name",
]:
cv["basic"][n] = work[-1].get(n, "")
if edu:
for n in ["school_name", "discipline_name"]:
if n in edu[-1]:
cv["basic"][n] = edu[-1][n]
cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
if "contact" not in cv:
cv["contact"] = {}
if not cv["contact"].get("name"):
cv["contact"]["name"] = cv["basic"].get("name", "")
return cv
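
A minimal sketch of the resume normalizer above, using an invented CV payload whose keys follow the fields the function reads.

if __name__ == "__main__":
    cv = {
        "raw_txt": "...",  # stripped by refactor()
        "basic": {"name": "Zhang San", "basic_salary_month": "20k"},
        "work": [
            {"start_time": "2018-01", "management_experience": "N",
             "annual_salary_from": "300000", "corporation_name": "ACME"},
        ],
        "education": [
            {"start_time": "2014-09", "school_name": "Example University",
             "discipline_name": "Computer Science"},
        ],
    }
    cv = refactor(cv)
    print(cv["basic"]["work_start_time"])   # "2018-01"
    print(cv["basic"]["school_name"])       # "Example University"
    print(cv["contact"]["name"])            # falls back to basic["name"]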

View File

@ -1,15 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View File

@ -1,128 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
import json
import os
import pandas as pd
from rag.nlp import rag_tokenizer
from . import regions
current_file_path = os.path.dirname(os.path.abspath(__file__))
GOODS = pd.read_csv(
os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0
).fillna(0)
GOODS["cid"] = GOODS["cid"].astype(str)
GOODS = GOODS.set_index(["cid"])
CORP_TKS = json.load(
open(os.path.join(current_file_path, "res/corp.tks.freq.json"), "r",encoding="utf-8")
)
GOOD_CORP = json.load(open(os.path.join(current_file_path, "res/good_corp.json"), "r",encoding="utf-8"))
CORP_TAG = json.load(open(os.path.join(current_file_path, "res/corp_tag.json"), "r",encoding="utf-8"))
def baike(cid, default_v=0):
global GOODS
try:
return GOODS.loc[str(cid), "len"]
except Exception:
pass
return default_v
def corpNorm(nm, add_region=True):
global CORP_TKS
if not nm or not isinstance(nm, str):
return ""
nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
nm = re.sub(r"&amp;", "&", nm)
nm = re.sub(r"[\(\)\+'\"\t \*\\【】-]+", " ", nm)
nm = re.sub(
r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE
)
nm = re.sub(
r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
"",
nm,
10000,
re.IGNORECASE,
)
if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
return nm
tks = rag_tokenizer.tokenize(nm).split()
reg = [t for i, t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
nm = ""
for t in tks:
if regions.isName(t) or t in CORP_TKS:
continue
if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):
nm += " "
nm += t
r = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", nm.strip())
if r:
nm = r.group(1)
r = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", nm.strip())
if r:
nm = r.group(1)
return nm.strip() + (("" if not reg else "(%s)" % reg[0]) if add_region else "")
def rmNoise(n):
n = re.sub(r"[\(][^()]+[)]", "", n)
n = re.sub(r"[,. &()]+", "", n)
return n
GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
for c, v in CORP_TAG.items():
cc = corpNorm(rmNoise(c), False)
if not cc:
logging.debug(c)
CORP_TAG = {corpNorm(rmNoise(c), False): v for c, v in CORP_TAG.items()}
def is_good(nm):
global GOOD_CORP
if nm.find("外派") >= 0:
return False
nm = rmNoise(nm)
nm = corpNorm(nm, False)
for n in GOOD_CORP:
if re.match(r"[0-9a-zA-Z]+$", n):
if n == nm:
return True
elif nm.find(n) >= 0:
return True
return False
def corp_tag(nm):
global CORP_TAG
nm = rmNoise(nm)
nm = corpNorm(nm, False)
for n in CORP_TAG.keys():
if re.match(r"[0-9a-zA-Z., ]+$", n):
if n == nm:
return CORP_TAG[n]
elif nm.find(n) >= 0:
if len(n) < 3 and len(nm) / len(n) >= 2:
continue
return CORP_TAG[n]
return []
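
A minimal sketch of the normalization helpers above; the company names are invented, and results depend on the bundled res/ dictionaries and rag_tokenizer.

if __name__ == "__main__":
    for raw in ["北京某某网络科技有限公司", "ACME Co., Ltd."]:
        # rmNoise strips parenthesized text and punctuation; corpNorm tokenizes,
        # drops region/suffix tokens, and optionally re-appends the region.
        print(raw, "->", corpNorm(rmNoise(raw)))
    print(is_good("某某科技有限公司"))   # True only if it maps into GOOD_CORP
    print(corp_tag("某某科技有限公司"))  # tag list from CORP_TAG, possibly []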

View File

@ -1,44 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
TBL = {
"94": "EMBA",
"6": "MBA",
"95": "MPA",
"92": "专升本",
"4": "专科",
"90": "中专",
"91": "中技",
"86": "初中",
"3": "博士",
"10": "博士后",
"1": "本科",
"2": "硕士",
"87": "职高",
"89": "高中",
}
TBL_ = {v: k for k, v in TBL.items()}
def get_name(id):
return TBL.get(str(id), "")
def get_id(nm):
if not nm:
return ""
return TBL_.get(nm.upper().strip(), "")
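
A short sketch of the degree-code table above; ids map to degree names, and get_id reverses the mapping case-insensitively.

if __name__ == "__main__":
    print(get_name(1))        # "本科" (bachelor's degree)
    print(get_name("3"))      # "博士" (doctorate)
    print(get_id("mba"))      # "6", since lookups are upper-cased
    print(get_id("unknown"))  # "" for names outside the table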

View File

@ -1,712 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
TBL = {
"1": {"name": "IT/通信/电子", "parent": "0"},
"2": {"name": "互联网", "parent": "0"},
"3": {"name": "电子商务", "parent": "2"},
"4": {"name": "互联网金融", "parent": "2"},
"5": {"name": "网络游戏", "parent": "2"},
"6": {"name": "社交网络平台", "parent": "2"},
"7": {"name": "视频音乐", "parent": "2"},
"9": {"name": "安全", "parent": "2"},
"10": {"name": "云计算", "parent": "2"},
"12": {"name": "工具类客户端应用", "parent": "2"},
"13": {"name": "互联网广告", "parent": "2"},
"14": {"name": "企业互联网服务", "parent": "2"},
"16": {"name": "在线教育", "parent": "2"},
"17": {"name": "在线医疗", "parent": "2"},
"19": {"name": "B2B", "parent": "3"},
"20": {"name": "B2C", "parent": "3"},
"21": {"name": "C2C", "parent": "3"},
"22": {"name": "生活信息本地化", "parent": "3"},
"23": {"name": "在线旅游", "parent": "2"},
"24": {"name": "第三方支付", "parent": "4"},
"26": {"name": "客户端游戏", "parent": "5"},
"27": {"name": "网页游戏", "parent": "5"},
"28": {"name": "手机游戏", "parent": "5"},
"29": {"name": "微博", "parent": "6"},
"30": {"name": "社交网站", "parent": "6"},
"31": {"name": "在线视频", "parent": "7"},
"32": {"name": "在线音乐", "parent": "7"},
"35": {"name": "企业安全", "parent": "9"},
"36": {"name": "个人安全", "parent": "9"},
"37": {"name": "企业级云服务", "parent": "10"},
"38": {"name": "个人级云服务", "parent": "10"},
"43": {"name": "输入法", "parent": "12"},
"44": {"name": "浏览器", "parent": "12"},
"45": {"name": "词典", "parent": "12"},
"46": {"name": "播放器", "parent": "12"},
"47": {"name": "下载器", "parent": "12"},
"48": {"name": "IM", "parent": "12"},
"49": {"name": "广告服务", "parent": "13"},
"50": {"name": "第三方广告网络平台", "parent": "13"},
"51": {"name": "媒体代理", "parent": "13"},
"52": {"name": "创意代理", "parent": "13"},
"53": {"name": "IT-综合", "parent": "1"},
"71": {"name": "团购", "parent": "3"},
"72": {"name": "地图", "parent": "2"},
"73": {"name": "数据存储", "parent": "2"},
"414": {"name": "计算机软件", "parent": "1"},
"415": {"name": "计算机硬件", "parent": "1"},
"416": {"name": "计算机服务(系统、数据服务、维修)", "parent": "1"},
"417": {"name": "通信/电信/网络设备", "parent": "1"},
"418": {"name": "通信/电信运营、增值服务", "parent": "1"},
"419": {"name": "电子技术/半导体/集成电路", "parent": "1"},
"472": {"name": "P2P网贷", "parent": "4"},
"473": {"name": "互联网理财", "parent": "4"},
"474": {"name": "婚恋", "parent": "6"},
"476": {"name": "虚拟化", "parent": "10"},
"477": {"name": "邮箱", "parent": "12"},
"478": {"name": "商业智能", "parent": "14"},
"479": {"name": "企业建站", "parent": "14"},
"480": {"name": "安防", "parent": "14"},
"481": {"name": "网络营销", "parent": "2"},
"487": {"name": "智能终端", "parent": "2"},
"488": {"name": "移动互联网", "parent": "2"},
"489": {"name": "数字城市", "parent": "2"},
"490": {"name": "大数据", "parent": "2"},
"491": {"name": "互联网人力资源", "parent": "2"},
"492": {"name": "舆情监控", "parent": "2"},
"493": {"name": "移动营销", "parent": "481"},
"494": {"name": "微博营销", "parent": "481"},
"495": {"name": "精准营销", "parent": "481"},
"496": {"name": "海外营销", "parent": "481"},
"497": {"name": "微信营销", "parent": "481"},
"498": {"name": "智能手机", "parent": "487"},
"499": {"name": "可穿戴设备", "parent": "487"},
"500": {"name": "智能电视", "parent": "487"},
"501": {"name": "WAP", "parent": "488"},
"502": {"name": "物联网", "parent": "489"},
"503": {"name": "O2O", "parent": "489"},
"504": {"name": "数字出版", "parent": "489"},
"505": {"name": "搜索", "parent": "2"},
"506": {"name": "垂直搜索", "parent": "505"},
"507": {"name": "无线搜索", "parent": "505"},
"508": {"name": "网页搜索", "parent": "505"},
"509": {"name": "网址导航", "parent": "2"},
"510": {"name": "门户", "parent": "2"},
"511": {"name": "网络文学", "parent": "2"},
"512": {"name": "自媒体", "parent": "2"},
"513": {"name": "金融", "parent": "0"},
"514": {"name": "建筑与房地产", "parent": "0"},
"515": {"name": "专业服务", "parent": "0"},
"516": {"name": "教育培训", "parent": "0"},
"517": {"name": "文化传媒", "parent": "0"},
"518": {"name": "消费品", "parent": "0"},
"519": {"name": "工业", "parent": "0"},
"520": {"name": "交通物流", "parent": "0"},
"521": {"name": "贸易", "parent": "0"},
"522": {"name": "医药", "parent": "0"},
"523": {"name": "医疗器械", "parent": "522"},
"524": {"name": "保健品", "parent": "518"},
"525": {"name": "服务业", "parent": "0"},
"526": {"name": "能源/矿产/环保", "parent": "0"},
"527": {"name": "化工", "parent": "0"},
"528": {"name": "政府", "parent": "0"},
"529": {"name": "公共事业", "parent": "0"},
"530": {"name": "非盈利机构", "parent": "0"},
"531": {"name": "农业", "parent": "1131"},
"532": {"name": "林业", "parent": "1131"},
"533": {"name": "畜牧业", "parent": "1131"},
"534": {"name": "渔业", "parent": "1131"},
"535": {"name": "学术科研", "parent": "0"},
"536": {"name": "零售", "parent": "0"},
"537": {"name": "银行", "parent": "513"},
"538": {"name": "保险", "parent": "513"},
"539": {"name": "证券", "parent": "513"},
"540": {"name": "基金", "parent": "513"},
"541": {"name": "信托", "parent": "513"},
"542": {"name": "担保", "parent": "513"},
"543": {"name": "典当", "parent": "513"},
"544": {"name": "拍卖", "parent": "513"},
"545": {"name": "投资/融资", "parent": "513"},
"546": {"name": "期货", "parent": "513"},
"547": {"name": "房地产开发", "parent": "514"},
"548": {"name": "工程施工", "parent": "514"},
"549": {"name": "建筑设计", "parent": "514"},
"550": {"name": "房地产代理", "parent": "514"},
"551": {"name": "物业管理", "parent": "514"},
"552": {"name": "室内设计", "parent": "514"},
"553": {"name": "装修装潢", "parent": "514"},
"554": {"name": "市政工程", "parent": "514"},
"555": {"name": "工程造价", "parent": "514"},
"556": {"name": "工程监理", "parent": "514"},
"557": {"name": "环境工程", "parent": "514"},
"558": {"name": "园林景观", "parent": "514"},
"559": {"name": "法律", "parent": "515"},
"560": {"name": "人力资源", "parent": "515"},
"561": {"name": "会计", "parent": "1125"},
"562": {"name": "审计", "parent": "515"},
"563": {"name": "检测认证", "parent": "515"},
"565": {"name": "翻译", "parent": "515"},
"566": {"name": "中介", "parent": "515"},
"567": {"name": "咨询", "parent": "515"},
"568": {"name": "外包服务", "parent": "515"},
"569": {"name": "家教", "parent": "516"},
"570": {"name": "早教", "parent": "516"},
"571": {"name": "职业技能培训", "parent": "516"},
"572": {"name": "外语培训", "parent": "516"},
"573": {"name": "设计培训", "parent": "516"},
"574": {"name": "IT培训", "parent": "516"},
"575": {"name": "文艺体育培训", "parent": "516"},
"576": {"name": "学历教育", "parent": "516"},
"577": {"name": "管理培训", "parent": "516"},
"578": {"name": "民办基础教育", "parent": "516"},
"579": {"name": "广告", "parent": "517"},
"580": {"name": "媒体", "parent": "517"},
"581": {"name": "会展", "parent": "517"},
"582": {"name": "公关", "parent": "517"},
"583": {"name": "影视", "parent": "517"},
"584": {"name": "艺术", "parent": "517"},
"585": {"name": "文化传播", "parent": "517"},
"586": {"name": "娱乐", "parent": "517"},
"587": {"name": "体育", "parent": "517"},
"588": {"name": "出版", "parent": "517"},
"589": {"name": "休闲", "parent": "517"},
"590": {"name": "动漫", "parent": "517"},
"591": {"name": "市场推广", "parent": "517"},
"592": {"name": "市场研究", "parent": "517"},
"593": {"name": "食品", "parent": "1129"},
"594": {"name": "饮料", "parent": "1129"},
"595": {"name": "烟草", "parent": "1129"},
"596": {"name": "酒品", "parent": "518"},
"597": {"name": "服饰", "parent": "518"},
"598": {"name": "纺织", "parent": "518"},
"599": {"name": "化妆品", "parent": "1129"},
"600": {"name": "日用品", "parent": "1129"},
"601": {"name": "家电", "parent": "518"},
"602": {"name": "家具", "parent": "518"},
"603": {"name": "办公用品", "parent": "518"},
"604": {"name": "奢侈品", "parent": "518"},
"605": {"name": "珠宝", "parent": "518"},
"606": {"name": "数码产品", "parent": "518"},
"607": {"name": "玩具", "parent": "518"},
"608": {"name": "图书", "parent": "518"},
"609": {"name": "音像", "parent": "518"},
"610": {"name": "钟表", "parent": "518"},
"611": {"name": "箱包", "parent": "518"},
"612": {"name": "母婴", "parent": "518"},
"613": {"name": "营养保健", "parent": "518"},
"614": {"name": "户外用品", "parent": "518"},
"615": {"name": "健身器材", "parent": "518"},
"616": {"name": "乐器", "parent": "518"},
"617": {"name": "汽车用品", "parent": "518"},
"619": {"name": "厨具", "parent": "518"},
"620": {"name": "机械制造", "parent": "519"},
"621": {"name": "流体控制", "parent": "519"},
"622": {"name": "自动化控制", "parent": "519"},
"623": {"name": "仪器仪表", "parent": "519"},
"624": {"name": "航空/航天", "parent": "519"},
"625": {"name": "交通设施", "parent": "519"},
"626": {"name": "工业电子", "parent": "519"},
"627": {"name": "建材", "parent": "519"},
"628": {"name": "五金材料", "parent": "519"},
"629": {"name": "汽车", "parent": "519"},
"630": {"name": "印刷", "parent": "519"},
"631": {"name": "造纸", "parent": "519"},
"632": {"name": "包装", "parent": "519"},
"633": {"name": "原材料及加工", "parent": "519"},
"634": {"name": "物流", "parent": "520"},
"635": {"name": "仓储", "parent": "520"},
"636": {"name": "客运", "parent": "520"},
"637": {"name": "快递", "parent": "520"},
"638": {"name": "化学药", "parent": "522"},
"639": {"name": "中药", "parent": "522"},
"640": {"name": "生物制药", "parent": "522"},
"641": {"name": "兽药", "parent": "522"},
"642": {"name": "农药", "parent": "522"},
"643": {"name": "CRO", "parent": "522"},
"644": {"name": "消毒", "parent": "522"},
"645": {"name": "医药商业", "parent": "522"},
"646": {"name": "医疗服务", "parent": "522"},
"647": {"name": "医疗器械", "parent": "523"},
"648": {"name": "制药设备", "parent": "523"},
"649": {"name": "医用耗材", "parent": "523"},
"650": {"name": "手术器械", "parent": "523"},
"651": {"name": "保健器材", "parent": "524"},
"652": {"name": "性保健品", "parent": "524"},
"653": {"name": "医药保养", "parent": "524"},
"654": {"name": "医用保健", "parent": "524"},
"655": {"name": "酒店", "parent": "525"},
"656": {"name": "餐饮", "parent": "525"},
"657": {"name": "旅游", "parent": "525"},
"658": {"name": "生活服务", "parent": "525"},
"659": {"name": "保健服务", "parent": "525"},
"660": {"name": "运动健身", "parent": "525"},
"661": {"name": "家政服务", "parent": "525"},
"662": {"name": "婚庆服务", "parent": "525"},
"663": {"name": "租赁服务", "parent": "525"},
"664": {"name": "维修服务", "parent": "525"},
"665": {"name": "石油天然气", "parent": "526"},
"666": {"name": "电力", "parent": "526"},
"667": {"name": "新能源", "parent": "526"},
"668": {"name": "水利", "parent": "526"},
"669": {"name": "矿产", "parent": "526"},
"670": {"name": "采掘业", "parent": "526"},
"671": {"name": "冶炼", "parent": "526"},
"672": {"name": "环保", "parent": "526"},
"673": {"name": "无机化工原料", "parent": "527"},
"674": {"name": "有机化工原料", "parent": "527"},
"675": {"name": "精细化学品", "parent": "527"},
"676": {"name": "化工设备", "parent": "527"},
"677": {"name": "化工工程", "parent": "527"},
"678": {"name": "资产管理", "parent": "513"},
"679": {"name": "金融租赁", "parent": "513"},
"680": {"name": "征信及信评机构", "parent": "513"},
"681": {"name": "资产评估机构", "parent": "513"},
"683": {"name": "金融监管机构", "parent": "513"},
"684": {"name": "国际贸易", "parent": "521"},
"685": {"name": "海关", "parent": "521"},
"686": {"name": "购物中心", "parent": "536"},
"687": {"name": "超市", "parent": "536"},
"688": {"name": "便利店", "parent": "536"},
"689": {"name": "专卖店", "parent": "536"},
"690": {"name": "专业店", "parent": "536"},
"691": {"name": "百货店", "parent": "536"},
"692": {"name": "杂货店", "parent": "536"},
"693": {"name": "个人银行", "parent": "537"},
"695": {"name": "私人银行", "parent": "537"},
"696": {"name": "公司银行", "parent": "537"},
"697": {"name": "投资银行", "parent": "537"},
"698": {"name": "政策性银行", "parent": "537"},
"699": {"name": "中央银行", "parent": "537"},
"700": {"name": "人寿险", "parent": "538"},
"701": {"name": "财产险", "parent": "538"},
"702": {"name": "再保险", "parent": "538"},
"703": {"name": "养老险", "parent": "538"},
"704": {"name": "保险代理公司", "parent": "538"},
"705": {"name": "公募基金", "parent": "540"},
"707": {"name": "私募基金", "parent": "540"},
"708": {"name": "第三方理财", "parent": "679"},
"709": {"name": "资产管理公司", "parent": "679"},
"711": {"name": "房产中介", "parent": "566"},
"712": {"name": "职业中介", "parent": "566"},
"713": {"name": "婚姻中介", "parent": "566"},
"714": {"name": "战略咨询", "parent": "567"},
"715": {"name": "投资咨询", "parent": "567"},
"716": {"name": "心理咨询", "parent": "567"},
"717": {"name": "留学移民咨询", "parent": "567"},
"718": {"name": "工商注册代理", "parent": "568"},
"719": {"name": "商标专利代理", "parent": "568"},
"720": {"name": "财务代理", "parent": "568"},
"721": {"name": "工程机械", "parent": "620"},
"722": {"name": "农业机械", "parent": "620"},
"723": {"name": "海工设备", "parent": "620"},
"724": {"name": "包装机械", "parent": "620"},
"725": {"name": "印刷机械", "parent": "620"},
"726": {"name": "数控机床", "parent": "620"},
"727": {"name": "矿山机械", "parent": "620"},
"728": {"name": "水泵", "parent": "621"},
"729": {"name": "管道", "parent": "621"},
"730": {"name": "阀门", "parent": "621"},
"732": {"name": "压缩机", "parent": "621"},
"733": {"name": "集散控制系统", "parent": "622"},
"734": {"name": "远程控制", "parent": "622"},
"735": {"name": "液压系统", "parent": "622"},
"736": {"name": "楼宇智能化", "parent": "622"},
"737": {"name": "飞机制造", "parent": "624"},
"738": {"name": "航空公司", "parent": "624"},
"739": {"name": "发动机", "parent": "624"},
"740": {"name": "复合材料", "parent": "624"},
"741": {"name": "高铁", "parent": "625"},
"742": {"name": "地铁", "parent": "625"},
"743": {"name": "信号传输", "parent": "625"},
"745": {"name": "结构材料", "parent": "627"},
"746": {"name": "装饰材料", "parent": "627"},
"747": {"name": "专用材料", "parent": "627"},
"749": {"name": "经销商集团", "parent": "629"},
"750": {"name": "整车制造", "parent": "629"},
"751": {"name": "汽车零配件", "parent": "629"},
"752": {"name": "外型设计", "parent": "629"},
"753": {"name": "平版印刷", "parent": "630"},
"754": {"name": "凸版印刷", "parent": "630"},
"755": {"name": "凹版印刷", "parent": "630"},
"756": {"name": "孔版印刷", "parent": "630"},
"757": {"name": "印刷用纸", "parent": "631"},
"758": {"name": "书写、制图及复制用纸", "parent": "631"},
"759": {"name": "包装用纸", "parent": "631"},
"760": {"name": "生活、卫生及装饰用纸", "parent": "631"},
"761": {"name": "技术用纸", "parent": "631"},
"762": {"name": "加工纸原纸", "parent": "631"},
"763": {"name": "食品包装", "parent": "632"},
"764": {"name": "医药包装", "parent": "632"},
"765": {"name": "日化包装", "parent": "632"},
"766": {"name": "物流包装", "parent": "632"},
"767": {"name": "礼品包装", "parent": "632"},
"768": {"name": "电子五金包装", "parent": "632"},
"769": {"name": "汽车服务", "parent": "525"},
"770": {"name": "汽车保养", "parent": "769"},
"771": {"name": "租车", "parent": "769"},
"773": {"name": "出租车", "parent": "769"},
"774": {"name": "代驾", "parent": "769"},
"775": {"name": "发电", "parent": "666"},
"777": {"name": "输配电", "parent": "666"},
"779": {"name": "风电", "parent": "667"},
"780": {"name": "光伏/太阳能", "parent": "667"},
"781": {"name": "生物质发电", "parent": "667"},
"782": {"name": "煤化工", "parent": "667"},
"783": {"name": "垃圾发电", "parent": "667"},
"784": {"name": "核电", "parent": "667"},
"785": {"name": "能源矿产", "parent": "669"},
"786": {"name": "金属矿产", "parent": "669"},
"787": {"name": "非金属矿产", "parent": "669"},
"788": {"name": "水气矿产", "parent": "669"},
"789": {"name": "锅炉", "parent": "775"},
"790": {"name": "发电机", "parent": "775"},
"791": {"name": "汽轮机", "parent": "775"},
"792": {"name": "燃机", "parent": "775"},
"793": {"name": "冷却", "parent": "775"},
"794": {"name": "电力设计院", "parent": "775"},
"795": {"name": "高压输配电", "parent": "777"},
"796": {"name": "中压输配电", "parent": "777"},
"797": {"name": "低压输配电", "parent": "777"},
"798": {"name": "继电保护", "parent": "777"},
"799": {"name": "智能电网", "parent": "777"},
"800": {"name": "小学", "parent": "516"},
"801": {"name": "电动车", "parent": "519"},
"802": {"name": "皮具箱包", "parent": "518"},
"803": {"name": "医药制造", "parent": "522"},
"804": {"name": "电器销售", "parent": "536"},
"805": {"name": "塑料制品", "parent": "527"},
"806": {"name": "公益基金会", "parent": "530"},
"807": {"name": "美发服务", "parent": "525"},
"808": {"name": "农业养殖", "parent": "531"},
"809": {"name": "金融服务", "parent": "513"},
"810": {"name": "商业地产综合体", "parent": "514"},
"811": {"name": "美容服务", "parent": "525"},
"812": {"name": "灯饰", "parent": "518"},
"813": {"name": "油墨颜料产品", "parent": "527"},
"814": {"name": "眼镜制造", "parent": "518"},
"815": {"name": "农业生物技术", "parent": "531"},
"816": {"name": "体育用品", "parent": "518"},
"817": {"name": "保健用品", "parent": "524"},
"818": {"name": "化学化工产品", "parent": "527"},
"819": {"name": "饲料", "parent": "531"},
"821": {"name": "保安服务", "parent": "525"},
"822": {"name": "干细胞技术", "parent": "522"},
"824": {"name": "农药化肥", "parent": "527"},
"825": {"name": "卫生洁具", "parent": "518"},
"826": {"name": "体育器材、场馆", "parent": "518"},
"827": {"name": "饲料加工", "parent": "531"},
"828": {"name": "测绘服务", "parent": "529"},
"830": {"name": "金属船舶制造", "parent": "519"},
"831": {"name": "基因工程", "parent": "522"},
"832": {"name": "花卉服务", "parent": "536"},
"833": {"name": "农业种植", "parent": "531"},
"834": {"name": "皮革制品", "parent": "518"},
"835": {"name": "地理信息加工服务", "parent": "529"},
"836": {"name": "机器人", "parent": "519"},
"837": {"name": "礼品", "parent": "518"},
"838": {"name": "理发及美容服务", "parent": "525"},
"839": {"name": "其他清洁服务", "parent": "525"},
"840": {"name": "硅胶材料", "parent": "527"},
"841": {"name": "茶叶销售", "parent": "518"},
"842": {"name": "彩票活动", "parent": "529"},
"843": {"name": "化妆培训", "parent": "516"},
"844": {"name": "鞋业", "parent": "518"},
"845": {"name": "酒店用品", "parent": "518"},
"846": {"name": "复合材料", "parent": "527"},
"847": {"name": "房地产工程建设", "parent": "548"},
"848": {"name": "知识产权服务", "parent": "559"},
"849": {"name": "新型建材", "parent": "627"},
"850": {"name": "企业投资咨询", "parent": "567"},
"851": {"name": "含乳饮料和植物蛋白饮料制造", "parent": "594"},
"852": {"name": "汽车检测设备", "parent": "629"},
"853": {"name": "手机通讯器材", "parent": "417"},
"854": {"name": "环保材料", "parent": "672"},
"855": {"name": "交通设施", "parent": "554"},
"856": {"name": "电子器件", "parent": "419"},
"857": {"name": "啤酒", "parent": "594"},
"858": {"name": "生态旅游", "parent": "657"},
"859": {"name": "自动化设备", "parent": "626"},
"860": {"name": "软件开发", "parent": "414"},
"861": {"name": "葡萄酒销售", "parent": "594"},
"862": {"name": "钢材", "parent": "633"},
"863": {"name": "餐饮培训", "parent": "656"},
"864": {"name": "速冻食品", "parent": "593"},
"865": {"name": "空气环保", "parent": "672"},
"866": {"name": "互联网房地产经纪服务", "parent": "550"},
"867": {"name": "食品添加剂", "parent": "593"},
"868": {"name": "演艺传播", "parent": "585"},
"869": {"name": "信用卡", "parent": "537"},
"870": {"name": "报纸期刊广告", "parent": "579"},
"871": {"name": "摄影", "parent": "525"},
"872": {"name": "手机软件", "parent": "414"},
"873": {"name": "地坪建材", "parent": "627"},
"874": {"name": "企业管理咨询", "parent": "567"},
"875": {"name": "幼儿教育", "parent": "570"},
"876": {"name": "系统集成", "parent": "416"},
"877": {"name": "皮革服饰", "parent": "597"},
"878": {"name": "保健食品", "parent": "593"},
"879": {"name": "叉车", "parent": "620"},
"880": {"name": "厨卫电器", "parent": "601"},
"882": {"name": "地暖设备", "parent": "627"},
"883": {"name": "钢结构制造", "parent": "548"},
"884": {"name": "投影机", "parent": "606"},
"885": {"name": "啤酒销售", "parent": "594"},
"886": {"name": "度假村旅游", "parent": "657"},
"887": {"name": "电力元件设备", "parent": "626"},
"888": {"name": "管理软件", "parent": "414"},
"889": {"name": "轴承", "parent": "628"},
"890": {"name": "餐饮设备", "parent": "656"},
"891": {"name": "肉制品及副产品加工", "parent": "593"},
"892": {"name": "艺术收藏品投资交易", "parent": "584"},
"893": {"name": "净水器", "parent": "601"},
"894": {"name": "进口食品", "parent": "593"},
"895": {"name": "娱乐文化传播", "parent": "585"},
"896": {"name": "文化传播", "parent": "585"},
"897": {"name": "商旅传媒", "parent": "580"},
"898": {"name": "广告设计制作", "parent": "579"},
"899": {"name": "金属丝绳及其制品制造", "parent": "627"},
"900": {"name": "建筑涂料", "parent": "627"},
"901": {"name": "抵押贷款", "parent": "543"},
"902": {"name": "早教", "parent": "570"},
"903": {"name": "电影放映", "parent": "583"},
"904": {"name": "内衣服饰", "parent": "597"},
"905": {"name": "无线网络通信", "parent": "418"},
"906": {"name": "记忆卡", "parent": "415"},
"907": {"name": "女装服饰", "parent": "597"},
"908": {"name": "建筑机械", "parent": "620"},
"909": {"name": "制冷电器", "parent": "601"},
"910": {"name": "通信设备", "parent": "417"},
"911": {"name": "空调设备", "parent": "601"},
"912": {"name": "建筑装饰", "parent": "553"},
"913": {"name": "办公设备", "parent": "603"},
"916": {"name": "数据处理软件", "parent": "414"},
"917": {"name": "葡萄酒贸易", "parent": "594"},
"918": {"name": "通讯器材", "parent": "417"},
"919": {"name": "铜业", "parent": "633"},
"920": {"name": "食堂", "parent": "656"},
"921": {"name": "糖果零食", "parent": "593"},
"922": {"name": "文化艺术传播", "parent": "584"},
"923": {"name": "太阳能电器", "parent": "601"},
"924": {"name": "药品零售", "parent": "645"},
"925": {"name": "果蔬食品", "parent": "593"},
"926": {"name": "文化活动策划", "parent": "585"},
"928": {"name": "汽车广告", "parent": "657"},
"929": {"name": "条码设备", "parent": "630"},
"930": {"name": "建筑石材", "parent": "627"},
"931": {"name": "贵金属", "parent": "545"},
"932": {"name": "体育", "parent": "660"},
"933": {"name": "金融信息服务", "parent": "414"},
"934": {"name": "玻璃建材", "parent": "627"},
"935": {"name": "家教", "parent": "569"},
"936": {"name": "歌舞厅娱乐活动", "parent": "586"},
"937": {"name": "计算机服务器", "parent": "415"},
"938": {"name": "管道", "parent": "627"},
"939": {"name": "婴幼儿服饰", "parent": "597"},
"940": {"name": "热水器", "parent": "601"},
"941": {"name": "计算机及零部件制造", "parent": "415"},
"942": {"name": "钢铁贸易", "parent": "633"},
"944": {"name": "包装材料", "parent": "632"},
"945": {"name": "计算机办公设备", "parent": "603"},
"946": {"name": "白酒", "parent": "594"},
"948": {"name": "发动机", "parent": "620"},
"949": {"name": "快餐服务", "parent": "656"},
"950": {"name": "酒类销售", "parent": "594"},
"951": {"name": "电子产品、机电设备", "parent": "626"},
"952": {"name": "激光设备", "parent": "626"},
"953": {"name": "餐饮策划", "parent": "656"},
"954": {"name": "饮料、食品", "parent": "594"},
"955": {"name": "文化娱乐经纪", "parent": "585"},
"956": {"name": "天然气", "parent": "665"},
"957": {"name": "农副食品", "parent": "593"},
"958": {"name": "艺术表演", "parent": "585"},
"959": {"name": "石膏、水泥制品及类似制品制造", "parent": "627"},
"960": {"name": "橱柜", "parent": "602"},
"961": {"name": "管理培训", "parent": "577"},
"962": {"name": "男装服饰", "parent": "597"},
"963": {"name": "化肥制造", "parent": "675"},
"964": {"name": "童装服饰", "parent": "597"},
"965": {"name": "电源电池", "parent": "626"},
"966": {"name": "家电维修", "parent": "664"},
"967": {"name": "光电子器件", "parent": "419"},
"968": {"name": "旅行社服务", "parent": "657"},
"969": {"name": "电线、电缆制造", "parent": "626"},
"970": {"name": "软件开发、信息系统集成", "parent": "419"},
"971": {"name": "白酒制造", "parent": "594"},
"973": {"name": "甜品服务", "parent": "656"},
"974": {"name": "糕点、面包制造", "parent": "593"},
"975": {"name": "木工机械", "parent": "620"},
"976": {"name": "酒吧服务", "parent": "656"},
"977": {"name": "火腿肠", "parent": "593"},
"978": {"name": "广告策划推广", "parent": "579"},
"979": {"name": "新能源产品和生产装备制造", "parent": "667"},
"980": {"name": "调味品", "parent": "593"},
"981": {"name": "礼仪表演", "parent": "585"},
"982": {"name": "劳务派遣", "parent": "560"},
"983": {"name": "建材零售", "parent": "627"},
"984": {"name": "商品交易中心", "parent": "545"},
"985": {"name": "体育推广", "parent": "585"},
"986": {"name": "茶饮料及其他饮料制造", "parent": "594"},
"987": {"name": "金属建材", "parent": "627"},
"988": {"name": "职业技能培训", "parent": "571"},
"989": {"name": "网吧活动", "parent": "586"},
"990": {"name": "洗衣服务", "parent": "658"},
"991": {"name": "管道工程", "parent": "554"},
"992": {"name": "通信工程", "parent": "417"},
"993": {"name": "电子元器件", "parent": "626"},
"994": {"name": "电子设备", "parent": "419"},
"995": {"name": "茶馆服务", "parent": "656"},
"996": {"name": "旅游开发", "parent": "657"},
"997": {"name": "视频通讯", "parent": "417"},
"998": {"name": "白酒销售", "parent": "594"},
"1000": {"name": "咖啡馆服务", "parent": "656"},
"1001": {"name": "食品零售", "parent": "593"},
"1002": {"name": "健康疗养旅游", "parent": "655"},
"1003": {"name": "粮油食品", "parent": "593"},
"1004": {"name": "儿童教育影视", "parent": "583"},
"1005": {"name": "新能源发电", "parent": "667"},
"1006": {"name": "旅游策划", "parent": "657"},
"1007": {"name": "绘画", "parent": "575"},
"1008": {"name": "方便面及其他方便食品", "parent": "593"},
"1009": {"name": "房地产经纪", "parent": "550"},
"1010": {"name": "母婴家政", "parent": "661"},
"1011": {"name": "居家养老健康服务", "parent": "661"},
"1012": {"name": "文化艺术投资", "parent": "545"},
"1013": {"name": "运动健身", "parent": "660"},
"1014": {"name": "瓶(罐)装饮用水制造", "parent": "594"},
"1015": {"name": "金属门窗", "parent": "627"},
"1016": {"name": "机动车检测", "parent": "563"},
"1017": {"name": "货物运输", "parent": "634"},
"1018": {"name": "服饰专卖", "parent": "690"},
"1019": {"name": "酒店服装", "parent": "597"},
"1020": {"name": "通讯软件", "parent": "417"},
"1021": {"name": "消防工程", "parent": "554"},
"1022": {"name": "嵌入式电子系统", "parent": "419"},
"1023": {"name": "航空票务", "parent": "636"},
"1024": {"name": "电气设备", "parent": "626"},
"1025": {"name": "酒业贸易", "parent": "594"},
"1027": {"name": "其他饮料及冷饮服务", "parent": "656"},
"1028": {"name": "乳制品", "parent": "593"},
"1029": {"name": "新闻期刊出版", "parent": "588"},
"1030": {"name": "水污染治理", "parent": "672"},
"1031": {"name": "谷物食品", "parent": "593"},
"1032": {"name": "数字动漫设计制造服务", "parent": "590"},
"1033": {"name": "医院", "parent": "646"},
"1034": {"name": "旅游广告", "parent": "657"},
"1035": {"name": "办公家具", "parent": "602"},
"1036": {"name": "房地产营销策划", "parent": "550"},
"1037": {"name": "保洁家政", "parent": "661"},
"1038": {"name": "水泥制造", "parent": "627"},
"1039": {"name": "市场研究咨询", "parent": "567"},
"1040": {"name": "驾校", "parent": "571"},
"1041": {"name": "正餐服务", "parent": "656"},
"1043": {"name": "机动车燃油", "parent": "665"},
"1044": {"name": "食品", "parent": "593"},
"1045": {"name": "新能源汽车", "parent": "629"},
"1046": {"name": "手机无线网络推广", "parent": "417"},
"1047": {"name": "环保设备", "parent": "672"},
"1048": {"name": "通讯工程", "parent": "418"},
"1049": {"name": "半导体集成电路", "parent": "419"},
"1050": {"name": "航空服务", "parent": "636"},
"1051": {"name": "电机设备", "parent": "626"},
"1052": {"name": "档案软件", "parent": "414"},
"1053": {"name": "冷链物流服务", "parent": "634"},
"1054": {"name": "小吃服务", "parent": "656"},
"1055": {"name": "水产品加工", "parent": "593"},
"1056": {"name": "图书出版", "parent": "588"},
"1057": {"name": "固体废物治理", "parent": "672"},
"1059": {"name": "坚果食品", "parent": "593"},
"1060": {"name": "广告传媒", "parent": "579"},
"1061": {"name": "电梯", "parent": "622"},
"1062": {"name": "社区医疗与卫生院", "parent": "646"},
"1063": {"name": "广告、印刷包装", "parent": "630"},
"1064": {"name": "婚纱礼服", "parent": "662"},
"1065": {"name": "地毯", "parent": "602"},
"1066": {"name": "互联网物业", "parent": "551"},
"1067": {"name": "跨境电商", "parent": "3"},
"1068": {"name": "信息安全、系统集成", "parent": "9"},
"1069": {"name": "专用汽车制造", "parent": "750"},
"1070": {"name": "商品贸易", "parent": "3"},
"1071": {"name": "墙壁装饰材料", "parent": "746"},
"1072": {"name": "窗帘装饰材料", "parent": "746"},
"1073": {"name": "电子商务、本地生活服务", "parent": "3"},
"1075": {"name": "白酒电子商务", "parent": "3"},
"1076": {"name": "商品贸易、电子商务", "parent": "3"},
"1077": {"name": "木质装饰材料", "parent": "746"},
"1078": {"name": "电子商务、汽车电商交易平台", "parent": "3"},
"1079": {"name": "汽车轮胎", "parent": "751"},
"1080": {"name": "气体压缩机械制造", "parent": "732"},
"1081": {"name": "家装家具电子商务", "parent": "3"},
"1082": {"name": "化妆品电子商务", "parent": "3"},
"1083": {"name": "汽车销售", "parent": "749"},
"1084": {"name": "新闻资讯网站", "parent": "510"},
"1085": {"name": "母婴电商", "parent": "3"},
"1086": {"name": "电商商务、收藏品交易", "parent": "3"},
"1088": {"name": "电子商务、数码产品", "parent": "3"},
"1089": {"name": "二手车交易", "parent": "749"},
"1090": {"name": "游戏制作服务", "parent": "5"},
"1091": {"name": "母婴服务", "parent": "510"},
"1092": {"name": "家具电子商务", "parent": "3"},
"1093": {"name": "汽车配件电子商务", "parent": "3"},
"1094": {"name": "输配电设备", "parent": "777"},
"1095": {"name": "矿山设备", "parent": "727"},
"1096": {"name": "机床机械", "parent": "726"},
"1097": {"name": "农产品电商", "parent": "3"},
"1098": {"name": "陶瓷装饰材料", "parent": "746"},
"1099": {"name": "车载联网设备", "parent": "487"},
"1100": {"name": "汽车销售电子商务", "parent": "3"},
"1101": {"name": "石油设备", "parent": "730"},
"1102": {"name": "智能家居", "parent": "487"},
"1103": {"name": "散热器", "parent": "751"},
"1104": {"name": "电力工程", "parent": "775"},
"1105": {"name": "生鲜电商", "parent": "3"},
"1106": {"name": "互联网数据服务", "parent": "490"},
"1107": {"name": "房车、商务车销售", "parent": "749"},
"1108": {"name": "茶叶电子商务", "parent": "3"},
"1109": {"name": "酒类电子商务", "parent": "3"},
"1110": {"name": "阀门", "parent": "730"},
"1111": {"name": "食品电商", "parent": "3"},
"1112": {"name": "儿童摄影", "parent": "871"},
"1113": {"name": "广告摄影", "parent": "871"},
"1114": {"name": "婚纱摄影", "parent": "871"},
"1115": {"name": "模具制造", "parent": "620"},
"1116": {"name": "汽车模具", "parent": "629"},
"1117": {"name": "认证咨询", "parent": "567"},
"1118": {"name": "数字视觉制作服务", "parent": "590"},
"1119": {"name": "牙科及医疗器械", "parent": "646"},
"1120": {"name": "猎头招聘", "parent": "560"},
"1121": {"name": "家居", "parent": "518"},
"1122": {"name": "收藏品", "parent": "518"},
"1123": {"name": "首饰", "parent": "518"},
"1124": {"name": "工艺品", "parent": "518"},
"1125": {"name": "财务", "parent": "515"},
"1126": {"name": "税务", "parent": "515"},
"1127": {"name": "分类信息", "parent": "2"},
"1128": {"name": "宠物", "parent": "0"},
"1129": {"name": "快消品", "parent": "518"},
"1130": {"name": "人工智能", "parent": "2"},
"1131": {"name": "农/林/牧/渔", "parent": "0"},
}
def get_names(id):
id = str(id)
nms = []
d = TBL.get(id)
if not d:
return []
nms.append(d["name"])
p = get_names(d["parent"])
if p:
nms.extend(p)
return nms
if __name__ == "__main__":
print(get_names("1119"))
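
For reference, the demo above resolves the parent chain leaf-to-root; based on the entries in TBL it is expected to print:

# get_names walks parent pointers until an id is missing from TBL:
#   "1119" (牙科及医疗器械) -> "646" (医疗服务) -> "522" (医药) -> parent "0" stops
# so the demo prints: ['牙科及医疗器械', '医疗服务', '医药']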

View File

@ -1,789 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
TBL = {
"2": {"name": "北京", "parent": "1"},
"3": {"name": "天津", "parent": "1"},
"4": {"name": "河北", "parent": "1"},
"5": {"name": "山西", "parent": "1"},
"6": {"name": "内蒙古", "parent": "1"},
"7": {"name": "辽宁", "parent": "1"},
"8": {"name": "吉林", "parent": "1"},
"9": {"name": "黑龙江", "parent": "1"},
"10": {"name": "上海", "parent": "1"},
"11": {"name": "江苏", "parent": "1"},
"12": {"name": "浙江", "parent": "1"},
"13": {"name": "安徽", "parent": "1"},
"14": {"name": "福建", "parent": "1"},
"15": {"name": "江西", "parent": "1"},
"16": {"name": "山东", "parent": "1"},
"17": {"name": "河南", "parent": "1"},
"18": {"name": "湖北", "parent": "1"},
"19": {"name": "湖南", "parent": "1"},
"20": {"name": "广东", "parent": "1"},
"21": {"name": "广西", "parent": "1"},
"22": {"name": "海南", "parent": "1"},
"23": {"name": "重庆", "parent": "1"},
"24": {"name": "四川", "parent": "1"},
"25": {"name": "贵州", "parent": "1"},
"26": {"name": "云南", "parent": "1"},
"27": {"name": "西藏", "parent": "1"},
"28": {"name": "陕西", "parent": "1"},
"29": {"name": "甘肃", "parent": "1"},
"30": {"name": "青海", "parent": "1"},
"31": {"name": "宁夏", "parent": "1"},
"32": {"name": "新疆", "parent": "1"},
"33": {"name": "北京市", "parent": "2"},
"34": {"name": "天津市", "parent": "3"},
"35": {"name": "石家庄市", "parent": "4"},
"36": {"name": "唐山市", "parent": "4"},
"37": {"name": "秦皇岛市", "parent": "4"},
"38": {"name": "邯郸市", "parent": "4"},
"39": {"name": "邢台市", "parent": "4"},
"40": {"name": "保定市", "parent": "4"},
"41": {"name": "张家口市", "parent": "4"},
"42": {"name": "承德市", "parent": "4"},
"43": {"name": "沧州市", "parent": "4"},
"44": {"name": "廊坊市", "parent": "4"},
"45": {"name": "衡水市", "parent": "4"},
"46": {"name": "太原市", "parent": "5"},
"47": {"name": "大同市", "parent": "5"},
"48": {"name": "阳泉市", "parent": "5"},
"49": {"name": "长治市", "parent": "5"},
"50": {"name": "晋城市", "parent": "5"},
"51": {"name": "朔州市", "parent": "5"},
"52": {"name": "晋中市", "parent": "5"},
"53": {"name": "运城市", "parent": "5"},
"54": {"name": "忻州市", "parent": "5"},
"55": {"name": "临汾市", "parent": "5"},
"56": {"name": "吕梁市", "parent": "5"},
"57": {"name": "呼和浩特市", "parent": "6"},
"58": {"name": "包头市", "parent": "6"},
"59": {"name": "乌海市", "parent": "6"},
"60": {"name": "赤峰市", "parent": "6"},
"61": {"name": "通辽市", "parent": "6"},
"62": {"name": "鄂尔多斯市", "parent": "6"},
"63": {"name": "呼伦贝尔市", "parent": "6"},
"64": {"name": "巴彦淖尔市", "parent": "6"},
"65": {"name": "乌兰察布市", "parent": "6"},
"66": {"name": "兴安盟", "parent": "6"},
"67": {"name": "锡林郭勒盟", "parent": "6"},
"68": {"name": "阿拉善盟", "parent": "6"},
"69": {"name": "沈阳市", "parent": "7"},
"70": {"name": "大连市", "parent": "7"},
"71": {"name": "鞍山市", "parent": "7"},
"72": {"name": "抚顺市", "parent": "7"},
"73": {"name": "本溪市", "parent": "7"},
"74": {"name": "丹东市", "parent": "7"},
"75": {"name": "锦州市", "parent": "7"},
"76": {"name": "营口市", "parent": "7"},
"77": {"name": "阜新市", "parent": "7"},
"78": {"name": "辽阳市", "parent": "7"},
"79": {"name": "盘锦市", "parent": "7"},
"80": {"name": "铁岭市", "parent": "7"},
"81": {"name": "朝阳市", "parent": "7"},
"82": {"name": "葫芦岛市", "parent": "7"},
"83": {"name": "长春市", "parent": "8"},
"84": {"name": "吉林市", "parent": "8"},
"85": {"name": "四平市", "parent": "8"},
"86": {"name": "辽源市", "parent": "8"},
"87": {"name": "通化市", "parent": "8"},
"88": {"name": "白山市", "parent": "8"},
"89": {"name": "松原市", "parent": "8"},
"90": {"name": "白城市", "parent": "8"},
"91": {"name": "延边朝鲜族自治州", "parent": "8"},
"92": {"name": "哈尔滨市", "parent": "9"},
"93": {"name": "齐齐哈尔市", "parent": "9"},
"94": {"name": "鸡西市", "parent": "9"},
"95": {"name": "鹤岗市", "parent": "9"},
"96": {"name": "双鸭山市", "parent": "9"},
"97": {"name": "大庆市", "parent": "9"},
"98": {"name": "伊春市", "parent": "9"},
"99": {"name": "佳木斯市", "parent": "9"},
"100": {"name": "七台河市", "parent": "9"},
"101": {"name": "牡丹江市", "parent": "9"},
"102": {"name": "黑河市", "parent": "9"},
"103": {"name": "绥化市", "parent": "9"},
"104": {"name": "大兴安岭地区", "parent": "9"},
"105": {"name": "上海市", "parent": "10"},
"106": {"name": "南京市", "parent": "11"},
"107": {"name": "无锡市", "parent": "11"},
"108": {"name": "徐州市", "parent": "11"},
"109": {"name": "常州市", "parent": "11"},
"110": {"name": "苏州市", "parent": "11"},
"111": {"name": "南通市", "parent": "11"},
"112": {"name": "连云港市", "parent": "11"},
"113": {"name": "淮安市", "parent": "11"},
"114": {"name": "盐城市", "parent": "11"},
"115": {"name": "扬州市", "parent": "11"},
"116": {"name": "镇江市", "parent": "11"},
"117": {"name": "泰州市", "parent": "11"},
"118": {"name": "宿迁市", "parent": "11"},
"119": {"name": "杭州市", "parent": "12"},
"120": {"name": "宁波市", "parent": "12"},
"121": {"name": "温州市", "parent": "12"},
"122": {"name": "嘉兴市", "parent": "12"},
"123": {"name": "湖州市", "parent": "12"},
"124": {"name": "绍兴市", "parent": "12"},
"125": {"name": "金华市", "parent": "12"},
"126": {"name": "衢州市", "parent": "12"},
"127": {"name": "舟山市", "parent": "12"},
"128": {"name": "台州市", "parent": "12"},
"129": {"name": "丽水市", "parent": "12"},
"130": {"name": "合肥市", "parent": "13"},
"131": {"name": "芜湖市", "parent": "13"},
"132": {"name": "蚌埠市", "parent": "13"},
"133": {"name": "淮南市", "parent": "13"},
"134": {"name": "马鞍山市", "parent": "13"},
"135": {"name": "淮北市", "parent": "13"},
"136": {"name": "铜陵市", "parent": "13"},
"137": {"name": "安庆市", "parent": "13"},
"138": {"name": "黄山市", "parent": "13"},
"139": {"name": "滁州市", "parent": "13"},
"140": {"name": "阜阳市", "parent": "13"},
"141": {"name": "宿州市", "parent": "13"},
"143": {"name": "六安市", "parent": "13"},
"144": {"name": "亳州市", "parent": "13"},
"145": {"name": "池州市", "parent": "13"},
"146": {"name": "宣城市", "parent": "13"},
"147": {"name": "福州市", "parent": "14"},
"148": {"name": "厦门市", "parent": "14"},
"149": {"name": "莆田市", "parent": "14"},
"150": {"name": "三明市", "parent": "14"},
"151": {"name": "泉州市", "parent": "14"},
"152": {"name": "漳州市", "parent": "14"},
"153": {"name": "南平市", "parent": "14"},
"154": {"name": "龙岩市", "parent": "14"},
"155": {"name": "宁德市", "parent": "14"},
"156": {"name": "南昌市", "parent": "15"},
"157": {"name": "景德镇市", "parent": "15"},
"158": {"name": "萍乡市", "parent": "15"},
"159": {"name": "九江市", "parent": "15"},
"160": {"name": "新余市", "parent": "15"},
"161": {"name": "鹰潭市", "parent": "15"},
"162": {"name": "赣州市", "parent": "15"},
"163": {"name": "吉安市", "parent": "15"},
"164": {"name": "宜春市", "parent": "15"},
"165": {"name": "抚州市", "parent": "15"},
"166": {"name": "上饶市", "parent": "15"},
"167": {"name": "济南市", "parent": "16"},
"168": {"name": "青岛市", "parent": "16"},
"169": {"name": "淄博市", "parent": "16"},
"170": {"name": "枣庄市", "parent": "16"},
"171": {"name": "东营市", "parent": "16"},
"172": {"name": "烟台市", "parent": "16"},
"173": {"name": "潍坊市", "parent": "16"},
"174": {"name": "济宁市", "parent": "16"},
"175": {"name": "泰安市", "parent": "16"},
"176": {"name": "威海市", "parent": "16"},
"177": {"name": "日照市", "parent": "16"},
"179": {"name": "临沂市", "parent": "16"},
"180": {"name": "德州市", "parent": "16"},
"181": {"name": "聊城市", "parent": "16"},
"182": {"name": "滨州市", "parent": "16"},
"183": {"name": "菏泽市", "parent": "16"},
"184": {"name": "郑州市", "parent": "17"},
"185": {"name": "开封市", "parent": "17"},
"186": {"name": "洛阳市", "parent": "17"},
"187": {"name": "平顶山市", "parent": "17"},
"188": {"name": "安阳市", "parent": "17"},
"189": {"name": "鹤壁市", "parent": "17"},
"190": {"name": "新乡市", "parent": "17"},
"191": {"name": "焦作市", "parent": "17"},
"192": {"name": "濮阳市", "parent": "17"},
"193": {"name": "许昌市", "parent": "17"},
"194": {"name": "漯河市", "parent": "17"},
"195": {"name": "三门峡市", "parent": "17"},
"196": {"name": "南阳市", "parent": "17"},
"197": {"name": "商丘市", "parent": "17"},
"198": {"name": "信阳市", "parent": "17"},
"199": {"name": "周口市", "parent": "17"},
"200": {"name": "驻马店市", "parent": "17"},
"201": {"name": "武汉市", "parent": "18"},
"202": {"name": "黄石市", "parent": "18"},
"203": {"name": "十堰市", "parent": "18"},
"204": {"name": "宜昌市", "parent": "18"},
"205": {"name": "襄阳市", "parent": "18"},
"206": {"name": "鄂州市", "parent": "18"},
"207": {"name": "荆门市", "parent": "18"},
"208": {"name": "孝感市", "parent": "18"},
"209": {"name": "荆州市", "parent": "18"},
"210": {"name": "黄冈市", "parent": "18"},
"211": {"name": "咸宁市", "parent": "18"},
"212": {"name": "随州市", "parent": "18"},
"213": {"name": "恩施土家族苗族自治州", "parent": "18"},
"215": {"name": "长沙市", "parent": "19"},
"216": {"name": "株洲市", "parent": "19"},
"217": {"name": "湘潭市", "parent": "19"},
"218": {"name": "衡阳市", "parent": "19"},
"219": {"name": "邵阳市", "parent": "19"},
"220": {"name": "岳阳市", "parent": "19"},
"221": {"name": "常德市", "parent": "19"},
"222": {"name": "张家界市", "parent": "19"},
"223": {"name": "益阳市", "parent": "19"},
"224": {"name": "郴州市", "parent": "19"},
"225": {"name": "永州市", "parent": "19"},
"226": {"name": "怀化市", "parent": "19"},
"227": {"name": "娄底市", "parent": "19"},
"228": {"name": "湘西土家族苗族自治州", "parent": "19"},
"229": {"name": "广州市", "parent": "20"},
"230": {"name": "韶关市", "parent": "20"},
"231": {"name": "深圳市", "parent": "20"},
"232": {"name": "珠海市", "parent": "20"},
"233": {"name": "汕头市", "parent": "20"},
"234": {"name": "佛山市", "parent": "20"},
"235": {"name": "江门市", "parent": "20"},
"236": {"name": "湛江市", "parent": "20"},
"237": {"name": "茂名市", "parent": "20"},
"238": {"name": "肇庆市", "parent": "20"},
"239": {"name": "惠州市", "parent": "20"},
"240": {"name": "梅州市", "parent": "20"},
"241": {"name": "汕尾市", "parent": "20"},
"242": {"name": "河源市", "parent": "20"},
"243": {"name": "阳江市", "parent": "20"},
"244": {"name": "清远市", "parent": "20"},
"245": {"name": "东莞市", "parent": "20"},
"246": {"name": "中山市", "parent": "20"},
"247": {"name": "潮州市", "parent": "20"},
"248": {"name": "揭阳市", "parent": "20"},
"249": {"name": "云浮市", "parent": "20"},
"250": {"name": "南宁市", "parent": "21"},
"251": {"name": "柳州市", "parent": "21"},
"252": {"name": "桂林市", "parent": "21"},
"253": {"name": "梧州市", "parent": "21"},
"254": {"name": "北海市", "parent": "21"},
"255": {"name": "防城港市", "parent": "21"},
"256": {"name": "钦州市", "parent": "21"},
"257": {"name": "贵港市", "parent": "21"},
"258": {"name": "玉林市", "parent": "21"},
"259": {"name": "百色市", "parent": "21"},
"260": {"name": "贺州市", "parent": "21"},
"261": {"name": "河池市", "parent": "21"},
"262": {"name": "来宾市", "parent": "21"},
"263": {"name": "崇左市", "parent": "21"},
"264": {"name": "海口市", "parent": "22"},
"265": {"name": "三亚市", "parent": "22"},
"267": {"name": "重庆市", "parent": "23"},
"268": {"name": "成都市", "parent": "24"},
"269": {"name": "自贡市", "parent": "24"},
"270": {"name": "攀枝花市", "parent": "24"},
"271": {"name": "泸州市", "parent": "24"},
"272": {"name": "德阳市", "parent": "24"},
"273": {"name": "绵阳市", "parent": "24"},
"274": {"name": "广元市", "parent": "24"},
"275": {"name": "遂宁市", "parent": "24"},
"276": {"name": "内江市", "parent": "24"},
"277": {"name": "乐山市", "parent": "24"},
"278": {"name": "南充市", "parent": "24"},
"279": {"name": "眉山市", "parent": "24"},
"280": {"name": "宜宾市", "parent": "24"},
"281": {"name": "广安市", "parent": "24"},
"282": {"name": "达州市", "parent": "24"},
"283": {"name": "雅安市", "parent": "24"},
"284": {"name": "巴中市", "parent": "24"},
"285": {"name": "资阳市", "parent": "24"},
"286": {"name": "阿坝藏族羌族自治州", "parent": "24"},
"287": {"name": "甘孜藏族自治州", "parent": "24"},
"288": {"name": "凉山彝族自治州", "parent": "24"},
"289": {"name": "贵阳市", "parent": "25"},
"290": {"name": "六盘水市", "parent": "25"},
"291": {"name": "遵义市", "parent": "25"},
"292": {"name": "安顺市", "parent": "25"},
"293": {"name": "铜仁市", "parent": "25"},
"294": {"name": "黔西南布依族苗族自治州", "parent": "25"},
"295": {"name": "毕节市", "parent": "25"},
"296": {"name": "黔东南苗族侗族自治州", "parent": "25"},
"297": {"name": "黔南布依族苗族自治州", "parent": "25"},
"298": {"name": "昆明市", "parent": "26"},
"299": {"name": "曲靖市", "parent": "26"},
"300": {"name": "玉溪市", "parent": "26"},
"301": {"name": "保山市", "parent": "26"},
"302": {"name": "昭通市", "parent": "26"},
"303": {"name": "丽江市", "parent": "26"},
"304": {"name": "普洱市", "parent": "26"},
"305": {"name": "临沧市", "parent": "26"},
"306": {"name": "楚雄彝族自治州", "parent": "26"},
"307": {"name": "红河哈尼族彝族自治州", "parent": "26"},
"308": {"name": "文山壮族苗族自治州", "parent": "26"},
"309": {"name": "西双版纳傣族自治州", "parent": "26"},
"310": {"name": "大理白族自治州", "parent": "26"},
"311": {"name": "德宏傣族景颇族自治州", "parent": "26"},
"312": {"name": "怒江傈僳族自治州", "parent": "26"},
"313": {"name": "迪庆藏族自治州", "parent": "26"},
"314": {"name": "拉萨市", "parent": "27"},
"315": {"name": "昌都市", "parent": "27"},
"316": {"name": "山南市", "parent": "27"},
"317": {"name": "日喀则市", "parent": "27"},
"318": {"name": "那曲市", "parent": "27"},
"319": {"name": "阿里地区", "parent": "27"},
"320": {"name": "林芝市", "parent": "27"},
"321": {"name": "西安市", "parent": "28"},
"322": {"name": "铜川市", "parent": "28"},
"323": {"name": "宝鸡市", "parent": "28"},
"324": {"name": "咸阳市", "parent": "28"},
"325": {"name": "渭南市", "parent": "28"},
"326": {"name": "延安市", "parent": "28"},
"327": {"name": "汉中市", "parent": "28"},
"328": {"name": "榆林市", "parent": "28"},
"329": {"name": "安康市", "parent": "28"},
"330": {"name": "商洛市", "parent": "28"},
"331": {"name": "兰州市", "parent": "29"},
"332": {"name": "嘉峪关市", "parent": "29"},
"333": {"name": "金昌市", "parent": "29"},
"334": {"name": "白银市", "parent": "29"},
"335": {"name": "天水市", "parent": "29"},
"336": {"name": "武威市", "parent": "29"},
"337": {"name": "张掖市", "parent": "29"},
"338": {"name": "平凉市", "parent": "29"},
"339": {"name": "酒泉市", "parent": "29"},
"340": {"name": "庆阳市", "parent": "29"},
"341": {"name": "定西市", "parent": "29"},
"342": {"name": "陇南市", "parent": "29"},
"343": {"name": "临夏回族自治州", "parent": "29"},
"344": {"name": "甘南藏族自治州", "parent": "29"},
"345": {"name": "西宁市", "parent": "30"},
"346": {"name": "海东市", "parent": "30"},
"347": {"name": "海北藏族自治州", "parent": "30"},
"348": {"name": "黄南藏族自治州", "parent": "30"},
"349": {"name": "海南藏族自治州", "parent": "30"},
"350": {"name": "果洛藏族自治州", "parent": "30"},
"351": {"name": "玉树藏族自治州", "parent": "30"},
"352": {"name": "海西蒙古族藏族自治州", "parent": "30"},
"353": {"name": "银川市", "parent": "31"},
"354": {"name": "石嘴山市", "parent": "31"},
"355": {"name": "吴忠市", "parent": "31"},
"356": {"name": "固原市", "parent": "31"},
"357": {"name": "中卫市", "parent": "31"},
"358": {"name": "乌鲁木齐市", "parent": "32"},
"359": {"name": "克拉玛依市", "parent": "32"},
"360": {"name": "吐鲁番市", "parent": "32"},
"361": {"name": "哈密市", "parent": "32"},
"362": {"name": "昌吉回族自治州", "parent": "32"},
"363": {"name": "博尔塔拉蒙古自治州", "parent": "32"},
"364": {"name": "巴音郭楞蒙古自治州", "parent": "32"},
"365": {"name": "阿克苏地区", "parent": "32"},
"366": {"name": "克孜勒苏柯尔克孜自治州", "parent": "32"},
"367": {"name": "喀什地区", "parent": "32"},
"368": {"name": "和田地区", "parent": "32"},
"369": {"name": "伊犁哈萨克自治州", "parent": "32"},
"370": {"name": "塔城地区", "parent": "32"},
"371": {"name": "阿勒泰地区", "parent": "32"},
"372": {"name": "新疆省直辖行政单位", "parent": "32"},
"373": {"name": "可克达拉市", "parent": "32"},
"374": {"name": "昆玉市", "parent": "32"},
"375": {"name": "胡杨河市", "parent": "32"},
"376": {"name": "双河市", "parent": "32"},
"3560": {"name": "北票市", "parent": "7"},
"3615": {"name": "高州市", "parent": "20"},
"3651": {"name": "济源市", "parent": "17"},
"3662": {"name": "胶南市", "parent": "16"},
"3683": {"name": "老河口市", "parent": "18"},
"3758": {"name": "沙河市", "parent": "4"},
"3822": {"name": "宜城市", "parent": "18"},
"3842": {"name": "枣阳市", "parent": "18"},
"3850": {"name": "肇东市", "parent": "9"},
"3905": {"name": "澳门", "parent": "1"},
"3906": {"name": "澳门", "parent": "3905"},
"3907": {"name": "香港", "parent": "1"},
"3908": {"name": "香港", "parent": "3907"},
"3947": {"name": "仙桃市", "parent": "18"},
"3954": {"name": "台湾", "parent": "1"},
"3955": {"name": "台湾", "parent": "3954"},
"3956": {"name": "海外", "parent": "1"},
"3957": {"name": "海外", "parent": "3956"},
"3958": {"name": "美国", "parent": "3956"},
"3959": {"name": "加拿大", "parent": "3956"},
"3961": {"name": "日本", "parent": "3956"},
"3962": {"name": "韩国", "parent": "3956"},
"3963": {"name": "德国", "parent": "3956"},
"3964": {"name": "英国", "parent": "3956"},
"3965": {"name": "意大利", "parent": "3956"},
"3966": {"name": "西班牙", "parent": "3956"},
"3967": {"name": "法国", "parent": "3956"},
"3968": {"name": "澳大利亚", "parent": "3956"},
"3969": {"name": "东城区", "parent": "2"},
"3970": {"name": "西城区", "parent": "2"},
"3971": {"name": "崇文区", "parent": "2"},
"3972": {"name": "宣武区", "parent": "2"},
"3973": {"name": "朝阳区", "parent": "2"},
"3974": {"name": "海淀区", "parent": "2"},
"3975": {"name": "丰台区", "parent": "2"},
"3976": {"name": "石景山区", "parent": "2"},
"3977": {"name": "门头沟区", "parent": "2"},
"3978": {"name": "房山区", "parent": "2"},
"3979": {"name": "通州区", "parent": "2"},
"3980": {"name": "顺义区", "parent": "2"},
"3981": {"name": "昌平区", "parent": "2"},
"3982": {"name": "大兴区", "parent": "2"},
"3983": {"name": "平谷区", "parent": "2"},
"3984": {"name": "怀柔区", "parent": "2"},
"3985": {"name": "密云区", "parent": "2"},
"3986": {"name": "延庆区", "parent": "2"},
"3987": {"name": "黄浦区", "parent": "10"},
"3988": {"name": "徐汇区", "parent": "10"},
"3989": {"name": "长宁区", "parent": "10"},
"3990": {"name": "静安区", "parent": "10"},
"3991": {"name": "普陀区", "parent": "10"},
"3992": {"name": "闸北区", "parent": "10"},
"3993": {"name": "虹口区", "parent": "10"},
"3994": {"name": "杨浦区", "parent": "10"},
"3995": {"name": "宝山区", "parent": "10"},
"3996": {"name": "闵行区", "parent": "10"},
"3997": {"name": "嘉定区", "parent": "10"},
"3998": {"name": "浦东新区", "parent": "10"},
"3999": {"name": "松江区", "parent": "10"},
"4000": {"name": "金山区", "parent": "10"},
"4001": {"name": "青浦区", "parent": "10"},
"4002": {"name": "奉贤区", "parent": "10"},
"4003": {"name": "崇明区", "parent": "10"},
"4004": {"name": "和平区", "parent": "3"},
"4005": {"name": "河东区", "parent": "3"},
"4006": {"name": "河西区", "parent": "3"},
"4007": {"name": "南开区", "parent": "3"},
"4008": {"name": "红桥区", "parent": "3"},
"4009": {"name": "河北区", "parent": "3"},
"4010": {"name": "滨海新区", "parent": "3"},
"4011": {"name": "东丽区", "parent": "3"},
"4012": {"name": "西青区", "parent": "3"},
"4013": {"name": "北辰区", "parent": "3"},
"4014": {"name": "津南区", "parent": "3"},
"4015": {"name": "武清区", "parent": "3"},
"4016": {"name": "宝坻区", "parent": "3"},
"4017": {"name": "静海区", "parent": "3"},
"4018": {"name": "宁河区", "parent": "3"},
"4019": {"name": "蓟州区", "parent": "3"},
"4020": {"name": "渝中区", "parent": "23"},
"4021": {"name": "江北区", "parent": "23"},
"4022": {"name": "南岸区", "parent": "23"},
"4023": {"name": "沙坪坝区", "parent": "23"},
"4024": {"name": "九龙坡区", "parent": "23"},
"4025": {"name": "大渡口区", "parent": "23"},
"4026": {"name": "渝北区", "parent": "23"},
"4027": {"name": "巴南区", "parent": "23"},
"4028": {"name": "北碚区", "parent": "23"},
"4029": {"name": "万州区", "parent": "23"},
"4030": {"name": "黔江区", "parent": "23"},
"4031": {"name": "永川区", "parent": "23"},
"4032": {"name": "涪陵区", "parent": "23"},
"4033": {"name": "江津区", "parent": "23"},
"4034": {"name": "合川区", "parent": "23"},
"4035": {"name": "双桥区", "parent": "23"},
"4036": {"name": "万盛区", "parent": "23"},
"4037": {"name": "荣昌区", "parent": "23"},
"4038": {"name": "大足区", "parent": "23"},
"4039": {"name": "璧山区", "parent": "23"},
"4040": {"name": "铜梁区", "parent": "23"},
"4041": {"name": "潼南区", "parent": "23"},
"4042": {"name": "綦江区", "parent": "23"},
"4043": {"name": "忠县", "parent": "23"},
"4044": {"name": "开州区", "parent": "23"},
"4045": {"name": "云阳县", "parent": "23"},
"4046": {"name": "梁平区", "parent": "23"},
"4047": {"name": "垫江县", "parent": "23"},
"4048": {"name": "丰都县", "parent": "23"},
"4049": {"name": "奉节县", "parent": "23"},
"4050": {"name": "巫山县", "parent": "23"},
"4051": {"name": "巫溪县", "parent": "23"},
"4052": {"name": "城口县", "parent": "23"},
"4053": {"name": "武隆区", "parent": "23"},
"4054": {"name": "石柱土家族自治县", "parent": "23"},
"4055": {"name": "秀山土家族苗族自治县", "parent": "23"},
"4056": {"name": "酉阳土家族苗族自治县", "parent": "23"},
"4057": {"name": "彭水苗族土家族自治县", "parent": "23"},
"4058": {"name": "潜江市", "parent": "18"},
"4059": {"name": "三沙市", "parent": "22"},
"4060": {"name": "石河子市", "parent": "32"},
"4061": {"name": "阿拉尔市", "parent": "32"},
"4062": {"name": "图木舒克市", "parent": "32"},
"4063": {"name": "五家渠市", "parent": "32"},
"4064": {"name": "北屯市", "parent": "32"},
"4065": {"name": "铁门关市", "parent": "32"},
"4066": {"name": "儋州市", "parent": "22"},
"4067": {"name": "五指山市", "parent": "22"},
"4068": {"name": "文昌市", "parent": "22"},
"4069": {"name": "琼海市", "parent": "22"},
"4070": {"name": "万宁市", "parent": "22"},
"4072": {"name": "定安县", "parent": "22"},
"4073": {"name": "屯昌县", "parent": "22"},
"4074": {"name": "澄迈县", "parent": "22"},
"4075": {"name": "临高县", "parent": "22"},
"4076": {"name": "琼中黎族苗族自治县", "parent": "22"},
"4077": {"name": "保亭黎族苗族自治县", "parent": "22"},
"4078": {"name": "白沙黎族自治县", "parent": "22"},
"4079": {"name": "昌江黎族自治县", "parent": "22"},
"4080": {"name": "乐东黎族自治县", "parent": "22"},
"4081": {"name": "陵水黎族自治县", "parent": "22"},
"4082": {"name": "马来西亚", "parent": "3956"},
"6047": {"name": "长寿区", "parent": "23"},
"6857": {"name": "阿富汗", "parent": "3956"},
"6858": {"name": "阿尔巴尼亚", "parent": "3956"},
"6859": {"name": "阿尔及利亚", "parent": "3956"},
"6860": {"name": "美属萨摩亚", "parent": "3956"},
"6861": {"name": "安道尔", "parent": "3956"},
"6862": {"name": "安哥拉", "parent": "3956"},
"6863": {"name": "安圭拉", "parent": "3956"},
"6864": {"name": "南极洲", "parent": "3956"},
"6865": {"name": "安提瓜和巴布达", "parent": "3956"},
"6866": {"name": "阿根廷", "parent": "3956"},
"6867": {"name": "亚美尼亚", "parent": "3956"},
"6869": {"name": "奥地利", "parent": "3956"},
"6870": {"name": "阿塞拜疆", "parent": "3956"},
"6871": {"name": "巴哈马", "parent": "3956"},
"6872": {"name": "巴林", "parent": "3956"},
"6873": {"name": "孟加拉国", "parent": "3956"},
"6874": {"name": "巴巴多斯", "parent": "3956"},
"6875": {"name": "白俄罗斯", "parent": "3956"},
"6876": {"name": "比利时", "parent": "3956"},
"6877": {"name": "伯利兹", "parent": "3956"},
"6878": {"name": "贝宁", "parent": "3956"},
"6879": {"name": "百慕大", "parent": "3956"},
"6880": {"name": "不丹", "parent": "3956"},
"6881": {"name": "玻利维亚", "parent": "3956"},
"6882": {"name": "波黑", "parent": "3956"},
"6883": {"name": "博茨瓦纳", "parent": "3956"},
"6884": {"name": "布维岛", "parent": "3956"},
"6885": {"name": "巴西", "parent": "3956"},
"6886": {"name": "英属印度洋领土", "parent": "3956"},
"6887": {"name": "文莱", "parent": "3956"},
"6888": {"name": "保加利亚", "parent": "3956"},
"6889": {"name": "布基纳法索", "parent": "3956"},
"6890": {"name": "布隆迪", "parent": "3956"},
"6891": {"name": "柬埔寨", "parent": "3956"},
"6892": {"name": "喀麦隆", "parent": "3956"},
"6893": {"name": "佛得角", "parent": "3956"},
"6894": {"name": "开曼群岛", "parent": "3956"},
"6895": {"name": "中非", "parent": "3956"},
"6896": {"name": "乍得", "parent": "3956"},
"6897": {"name": "智利", "parent": "3956"},
"6898": {"name": "圣诞岛", "parent": "3956"},
"6899": {"name": "科科斯(基林)群岛", "parent": "3956"},
"6900": {"name": "哥伦比亚", "parent": "3956"},
"6901": {"name": "科摩罗", "parent": "3956"},
"6902": {"name": "刚果(布)", "parent": "3956"},
"6903": {"name": "刚果(金)", "parent": "3956"},
"6904": {"name": "库克群岛", "parent": "3956"},
"6905": {"name": "哥斯达黎加", "parent": "3956"},
"6906": {"name": "科特迪瓦", "parent": "3956"},
"6907": {"name": "克罗地亚", "parent": "3956"},
"6908": {"name": "古巴", "parent": "3956"},
"6909": {"name": "塞浦路斯", "parent": "3956"},
"6910": {"name": "捷克", "parent": "3956"},
"6911": {"name": "丹麦", "parent": "3956"},
"6912": {"name": "吉布提", "parent": "3956"},
"6913": {"name": "多米尼克", "parent": "3956"},
"6914": {"name": "多米尼加共和国", "parent": "3956"},
"6915": {"name": "东帝汶", "parent": "3956"},
"6916": {"name": "厄瓜多尔", "parent": "3956"},
"6917": {"name": "埃及", "parent": "3956"},
"6918": {"name": "萨尔瓦多", "parent": "3956"},
"6919": {"name": "赤道几内亚", "parent": "3956"},
"6920": {"name": "厄立特里亚", "parent": "3956"},
"6921": {"name": "爱沙尼亚", "parent": "3956"},
"6922": {"name": "埃塞俄比亚", "parent": "3956"},
"6923": {"name": "福克兰群岛(马尔维纳斯)", "parent": "3956"},
"6924": {"name": "法罗群岛", "parent": "3956"},
"6925": {"name": "斐济", "parent": "3956"},
"6926": {"name": "芬兰", "parent": "3956"},
"6927": {"name": "法属圭亚那", "parent": "3956"},
"6928": {"name": "法属波利尼西亚", "parent": "3956"},
"6929": {"name": "法属南部领土", "parent": "3956"},
"6930": {"name": "加蓬", "parent": "3956"},
"6931": {"name": "冈比亚", "parent": "3956"},
"6932": {"name": "格鲁吉亚", "parent": "3956"},
"6933": {"name": "加纳", "parent": "3956"},
"6934": {"name": "直布罗陀", "parent": "3956"},
"6935": {"name": "希腊", "parent": "3956"},
"6936": {"name": "格陵兰", "parent": "3956"},
"6937": {"name": "格林纳达", "parent": "3956"},
"6938": {"name": "瓜德罗普", "parent": "3956"},
"6939": {"name": "关岛", "parent": "3956"},
"6940": {"name": "危地马拉", "parent": "3956"},
"6941": {"name": "几内亚", "parent": "3956"},
"6942": {"name": "几内亚比绍", "parent": "3956"},
"6943": {"name": "圭亚那", "parent": "3956"},
"6944": {"name": "海地", "parent": "3956"},
"6945": {"name": "赫德岛和麦克唐纳岛", "parent": "3956"},
"6946": {"name": "洪都拉斯", "parent": "3956"},
"6947": {"name": "匈牙利", "parent": "3956"},
"6948": {"name": "冰岛", "parent": "3956"},
"6949": {"name": "印度", "parent": "3956"},
"6950": {"name": "印度尼西亚", "parent": "3956"},
"6951": {"name": "伊朗", "parent": "3956"},
"6952": {"name": "伊拉克", "parent": "3956"},
"6953": {"name": "爱尔兰", "parent": "3956"},
"6954": {"name": "以色列", "parent": "3956"},
"6955": {"name": "牙买加", "parent": "3956"},
"6956": {"name": "约旦", "parent": "3956"},
"6957": {"name": "哈萨克斯坦", "parent": "3956"},
"6958": {"name": "肯尼亚", "parent": "3956"},
"6959": {"name": "基里巴斯", "parent": "3956"},
"6960": {"name": "朝鲜", "parent": "3956"},
"6961": {"name": "科威特", "parent": "3956"},
"6962": {"name": "吉尔吉斯斯坦", "parent": "3956"},
"6963": {"name": "老挝", "parent": "3956"},
"6964": {"name": "拉脱维亚", "parent": "3956"},
"6965": {"name": "黎巴嫩", "parent": "3956"},
"6966": {"name": "莱索托", "parent": "3956"},
"6967": {"name": "利比里亚", "parent": "3956"},
"6968": {"name": "利比亚", "parent": "3956"},
"6969": {"name": "列支敦士登", "parent": "3956"},
"6970": {"name": "立陶宛", "parent": "3956"},
"6971": {"name": "卢森堡", "parent": "3956"},
"6972": {"name": "前南马其顿", "parent": "3956"},
"6973": {"name": "马达加斯加", "parent": "3956"},
"6974": {"name": "马拉维", "parent": "3956"},
"6975": {"name": "马尔代夫", "parent": "3956"},
"6976": {"name": "马里", "parent": "3956"},
"6977": {"name": "马耳他", "parent": "3956"},
"6978": {"name": "马绍尔群岛", "parent": "3956"},
"6979": {"name": "马提尼克", "parent": "3956"},
"6980": {"name": "毛里塔尼亚", "parent": "3956"},
"6981": {"name": "毛里求斯", "parent": "3956"},
"6982": {"name": "马约特", "parent": "3956"},
"6983": {"name": "墨西哥", "parent": "3956"},
"6984": {"name": "密克罗尼西亚联邦", "parent": "3956"},
"6985": {"name": "摩尔多瓦", "parent": "3956"},
"6986": {"name": "摩纳哥", "parent": "3956"},
"6987": {"name": "蒙古", "parent": "3956"},
"6988": {"name": "蒙特塞拉特", "parent": "3956"},
"6989": {"name": "摩洛哥", "parent": "3956"},
"6990": {"name": "莫桑比克", "parent": "3956"},
"6991": {"name": "缅甸", "parent": "3956"},
"6992": {"name": "纳米比亚", "parent": "3956"},
"6993": {"name": "瑙鲁", "parent": "3956"},
"6994": {"name": "尼泊尔", "parent": "3956"},
"6995": {"name": "荷兰", "parent": "3956"},
"6996": {"name": "荷属安的列斯", "parent": "3956"},
"6997": {"name": "新喀里多尼亚", "parent": "3956"},
"6998": {"name": "新西兰", "parent": "3956"},
"6999": {"name": "尼加拉瓜", "parent": "3956"},
"7000": {"name": "尼日尔", "parent": "3956"},
"7001": {"name": "尼日利亚", "parent": "3956"},
"7002": {"name": "纽埃", "parent": "3956"},
"7003": {"name": "诺福克岛", "parent": "3956"},
"7004": {"name": "北马里亚纳", "parent": "3956"},
"7005": {"name": "挪威", "parent": "3956"},
"7006": {"name": "阿曼", "parent": "3956"},
"7007": {"name": "巴基斯坦", "parent": "3956"},
"7008": {"name": "帕劳", "parent": "3956"},
"7009": {"name": "巴勒斯坦", "parent": "3956"},
"7010": {"name": "巴拿马", "parent": "3956"},
"7011": {"name": "巴布亚新几内亚", "parent": "3956"},
"7012": {"name": "巴拉圭", "parent": "3956"},
"7013": {"name": "秘鲁", "parent": "3956"},
"7014": {"name": "菲律宾", "parent": "3956"},
"7015": {"name": "皮特凯恩群岛", "parent": "3956"},
"7016": {"name": "波兰", "parent": "3956"},
"7017": {"name": "葡萄牙", "parent": "3956"},
"7018": {"name": "波多黎各", "parent": "3956"},
"7019": {"name": "卡塔尔", "parent": "3956"},
"7020": {"name": "留尼汪", "parent": "3956"},
"7021": {"name": "罗马尼亚", "parent": "3956"},
"7022": {"name": "俄罗斯联邦", "parent": "3956"},
"7023": {"name": "卢旺达", "parent": "3956"},
"7024": {"name": "圣赫勒拿", "parent": "3956"},
"7025": {"name": "圣基茨和尼维斯", "parent": "3956"},
"7026": {"name": "圣卢西亚", "parent": "3956"},
"7027": {"name": "圣皮埃尔和密克隆", "parent": "3956"},
"7028": {"name": "圣文森特和格林纳丁斯", "parent": "3956"},
"7029": {"name": "萨摩亚", "parent": "3956"},
"7030": {"name": "圣马力诺", "parent": "3956"},
"7031": {"name": "圣多美和普林西比", "parent": "3956"},
"7032": {"name": "沙特阿拉伯", "parent": "3956"},
"7033": {"name": "塞内加尔", "parent": "3956"},
"7034": {"name": "塞舌尔", "parent": "3956"},
"7035": {"name": "塞拉利昂", "parent": "3956"},
"7036": {"name": "新加坡", "parent": "3956"},
"7037": {"name": "斯洛伐克", "parent": "3956"},
"7038": {"name": "斯洛文尼亚", "parent": "3956"},
"7039": {"name": "所罗门群岛", "parent": "3956"},
"7040": {"name": "索马里", "parent": "3956"},
"7041": {"name": "南非", "parent": "3956"},
"7042": {"name": "南乔治亚岛和南桑德韦奇岛", "parent": "3956"},
"7043": {"name": "斯里兰卡", "parent": "3956"},
"7044": {"name": "苏丹", "parent": "3956"},
"7045": {"name": "苏里南", "parent": "3956"},
"7046": {"name": "斯瓦尔巴群岛", "parent": "3956"},
"7047": {"name": "斯威士兰", "parent": "3956"},
"7048": {"name": "瑞典", "parent": "3956"},
"7049": {"name": "瑞士", "parent": "3956"},
"7050": {"name": "叙利亚", "parent": "3956"},
"7051": {"name": "塔吉克斯坦", "parent": "3956"},
"7052": {"name": "坦桑尼亚", "parent": "3956"},
"7053": {"name": "泰国", "parent": "3956"},
"7054": {"name": "多哥", "parent": "3956"},
"7055": {"name": "托克劳", "parent": "3956"},
"7056": {"name": "汤加", "parent": "3956"},
"7057": {"name": "特立尼达和多巴哥", "parent": "3956"},
"7058": {"name": "突尼斯", "parent": "3956"},
"7059": {"name": "土耳其", "parent": "3956"},
"7060": {"name": "土库曼斯坦", "parent": "3956"},
"7061": {"name": "特克斯科斯群岛", "parent": "3956"},
"7062": {"name": "图瓦卢", "parent": "3956"},
"7063": {"name": "乌干达", "parent": "3956"},
"7064": {"name": "乌克兰", "parent": "3956"},
"7065": {"name": "阿联酋", "parent": "3956"},
"7066": {"name": "美国本土外小岛屿", "parent": "3956"},
"7067": {"name": "乌拉圭", "parent": "3956"},
"7068": {"name": "乌兹别克斯坦", "parent": "3956"},
"7069": {"name": "瓦努阿图", "parent": "3956"},
"7070": {"name": "梵蒂冈", "parent": "3956"},
"7071": {"name": "委内瑞拉", "parent": "3956"},
"7072": {"name": "越南", "parent": "3956"},
"7073": {"name": "英属维尔京群岛", "parent": "3956"},
"7074": {"name": "美属维尔京群岛", "parent": "3956"},
"7075": {"name": "瓦利斯和富图纳", "parent": "3956"},
"7076": {"name": "西撒哈拉", "parent": "3956"},
"7077": {"name": "也门", "parent": "3956"},
"7078": {"name": "南斯拉夫", "parent": "3956"},
"7079": {"name": "赞比亚", "parent": "3956"},
"7080": {"name": "津巴布韦", "parent": "3956"},
"7081": {"name": "塞尔维亚", "parent": "3956"},
"7082": {"name": "雄安新区", "parent": "4"},
"7084": {"name": "天门市", "parent": "18"},
}
NM_SET = set([v["name"] for _, v in TBL.items()])
def get_names(id):
if not id or str(id).lower() == "none":
return []
id = str(id)
if not re.match("[0-9]+$", id.strip()):
return [id]
nms = []
d = TBL.get(id)
if not d:
return []
nms.append(d["name"])
p = get_names(d["parent"])
if p:
nms.extend(p)
return nms
def isName(nm):
if nm in NM_SET:
return True
if nm + "" in NM_SET:
return True
if re.sub(r"(省|(回族|壮族|维吾尔)*自治区)$", "", nm) in NM_SET:
return True
return False
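For context, a minimal usage sketch of the two helpers above, assuming TBL, get_names and isName from this removed regions module are in scope (the top-level province entries referenced by "parent" are defined earlier in the same table):

# Hypothetical usage of the removed regions helpers (illustrative, not part of the original diff).
print(get_names("267"))       # starts with ["重庆市", ...]; ancestor names come from the parent chain above this excerpt
print(get_names("beijing"))   # non-numeric ids are returned as-is: ["beijing"]
print(isName("重庆市"))        # True: the literal name is present in NM_SET
print(isName("云南省"))        # True only if "云南" remains in NM_SET after the "省" suffix is stripped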

View File

@ -1,65 +0,0 @@
[
"科技",
"集团",
"网络科技",
"技术",
"信息",
"分公司",
"信息技术",
"发展",
"科技股份",
"网络",
"贸易",
"商贸",
"工程",
"企业",
"集团股份",
"商务",
"工业",
"控股集团",
"国际贸易",
"软件技术",
"数码科技",
"软件开发",
"有限",
"经营",
"科技开发",
"股份公司",
"电子技术",
"实业集团",
"责任",
"无限",
"工程技术",
"上市公司",
"技术开发",
"软件系统",
"总公司",
"网络服务",
"ltd.",
"technology",
"company",
"服务公司",
"计算机技术",
"计算机软件",
"电子信息",
"corporation",
"计算机服务",
"计算机系统",
"有限公司",
"事业部",
"公司",
"股份",
"有限责任",
"软件",
"控股",
"高科技",
"房地产",
"事业群",
"部门",
"电子商务",
"人力资源顾问",
"人力资源",
"株式会社",
"网络营销"
]

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,911 +0,0 @@
[
"google assistant investments",
"amazon",
"dingtalk china information",
"zhejiang alibaba communication",
"yunos",
"腾讯云",
"新浪新闻",
"网邻通",
"蚂蚁集团",
"大疆",
"恒生股份",
"sf express",
"智者天下",
"shanghai hema network",
"papayamobile",
"lexinfintech",
"industrial consumer finance",
"360搜索",
"世纪光速",
"迅雷区块链",
"赛盒科技",
"齐力电子商务",
"平安养老险",
"平安证券",
"平安好贷",
"五八新服",
"呯嘭智能",
"阿里妈妈",
"mdt",
"tencent",
"weibo",
"浪潮软件",
"阿里巴巴广告",
"mashang consumer finance",
"维沃",
"hqg , limited",
"moodys",
"搜狐支付",
"百度秀",
"新浪服务",
"零售通",
"同城艺龙",
"虾米音乐",
"贝壳集团",
"小米有品",
"滴滴自动驾驶",
"图记",
"阿里影业",
"卓联软件",
"zhejiang tmall",
"谷歌中国",
"hithink flush",
"时装科技",
"程会玩国际旅行社",
"amazon china holding limited",
"中信消金",
"当当比特物流",
"新浪新媒体咨询",
"tongcheng network",
"金山在线",
"shopping cart",
"犀互动",
"五八",
"bilibili",
"阿里星球",
"滴滴金科服务",
"美团",
"哈啰出行",
"face",
"平安健康",
"招商银行",
"连亚",
"盒马网络",
"b站",
"华为机器",
"shanghai mdt infotech",
"ping an healthkonnect",
"beijing home link real estate broker",
"花海仓",
"beijing jingdong shangke information",
"微影智能",
"酷狗游戏",
"health.pingan.com",
"众安",
"陌陌",
"海康威视数字",
"同程网",
"艾丁金融",
"知乎",
" lu",
"国际商业机器公司",
"捷信消费金融",
"恒生利融",
"china merchants bank",
"企鹅电竞",
"捷信信驰",
"360智能家居",
"小桔车服",
"homecredit",
"皮皮虾",
"畅游",
"聚爱聊",
"suning.com",
"途牛旅游网",
"花呗",
"盈店通",
"sina",
"阿里巴巴音乐",
"华为技术有限公司",
"国付宝",
"shanghai lianshang network",
"oppo",
"华为投资控股",
"beijing sohu new media information",
"times square",
"菜鸟物流",
"lingxing",
"jd digits",
"同程旅游",
"分期乐",
"火锅视频",
"天天快报",
"猎豹移动",
"五八人力资源",
"宝宝树",
"顺丰科技",
"上海西翠",
"诗程文化传播",
"dewu",
"领星网络",
"aliexpress",
"贝塔通科技",
"链家",
"花小猪",
"趣输入",
"搜狐新媒体",
"一淘",
"56",
"qq阅读",
"青桔单车",
"iflytek",
"每日优鲜电子商务",
"腾讯觅影",
"微医",
"松果网",
"paypal",
"递瑞供应链管理",
"领星",
"qunar",
"三快",
"lu.com",
"携程旅行网",
"新潮传媒",
"链家经纪",
"景域文化",
"阿里健康",
"pingpeng",
"聚划算",
"零机科技",
"街兔电单车",
"快乐购",
"华为数字能源",
"搜狐",
"陆家嘴国际金融资产交易市场",
"nanjing tuniu",
"亚马逊",
"苏宁易购",
"携程旅游",
"苏宁金服",
"babytree",
"悟空问答",
"同花顺",
"eastmoney",
"浪潮信息",
"滴滴智慧交通",
"beijing ruixun lingtong",
"平安综合金融服务",
"爱奇艺",
"小米集团",
"华为云",
"微店",
"恒生集团",
"网易有道",
"boccfc",
"世纪思速科技",
"海康消防",
"beijing xiaomi",
"众安科技",
"五八同城",
"霆程汽车租赁",
"云卖分销",
"乐信集团",
"蚂蚁",
"舶乐蜜电子商务",
"支付宝中国",
"砖块消消消",
"vivo",
"阿里互娱",
"中国平安",
"lingxihudong",
"百度网盘",
"1号店",
"字节跳动",
"京东科技",
"驴妈妈兴旅国际旅行社",
"hangzhou alibaba music",
"xunlei",
"灵犀互动娱乐",
"快手",
"youtube",
"连尚慧眼",
"腾讯体育",
"爱商在线",
"酷我音乐",
"金融壹账通",
"搜狗服务",
"banma information",
"a站",
"罗汉堂",
"薇仕网络",
"搜狐新闻",
"贝宝",
"薇仕",
"口袋时尚科技",
"穆迪咨询",
"新狐投资管理",
"hikvision",
"alimama china holding limited",
"超聚变数字",
"腾讯视频",
"恒生电子",
"百度游戏",
"绿洲",
"木瓜移动",
"红袖添香",
"店匠科技",
"易贝",
"一淘网",
"博览群书",
"唯品会",
"lazglobal",
"amap",
"芒果网",
"口碑",
"海康慧影",
"腾讯音乐娱乐",
"网易严选",
"微信",
"shenzhen lexin holding",
"hangzhou pingpeng intelligent",
"连尚网络",
"海思",
"isunor",
"蝉翼",
"阿里游戏",
"广州优视",
"优视",
"腾讯征信",
"识装",
"finserve.pingan.com",
"papaya",
"阅文",
"平安健康保险",
"考拉海购",
"网易印象",
"wifi万能钥匙",
"新浪互联服务",
"亚马逊云科技",
"迅雷看看",
"华为朗新科技",
"adyen hong kong limited",
"谷歌",
"得物",
"网心",
"cainiao network",
"沐瞳",
"linkedln",
"hundsun",
"阿里旅行",
"珍爱网",
"阿里巴巴通信",
"金山奇剑",
"tongtool",
"华为安捷信电气",
"快乐时代",
"平安寿险",
"微博",
"微跳蚤",
"oppo移动通信",
"毒",
"alimama",
"shoplazza",
"shenzhen dianjiang science and",
"众鸣世科",
"平安金融",
"狐友",
"维沃移动通信",
"tobosoft",
"齐力电商",
"ali",
"诚信通",
"行吟",
"跳舞的线",
"橙心优选",
"众安健康",
"亚马逊中国投资",
"德絮投资管理中心合伙",
"招联消费金融",
"百度文学",
"芝麻信用",
"阿里零售通",
"时装",
"花样直播",
"sogou",
"uc",
"海思半导体",
"zhongan online p&c insurance",
"新浪数字",
"驴妈妈旅游网",
"华为数字能源技术",
"京东数科",
"oracle",
"xiaomi",
"nyse",
"阳光消费金融",
"天天动听",
"大众点评",
"上海瑞家",
"trustpass",
"hundsun technologies",
"美团小贷",
"ebay",
"通途",
"tcl",
"鸿蒙",
"酷狗计算机",
"品诺保险",
"capitalg",
"康盛创想",
"58同城",
"闲鱼",
"微软",
"吉易付科技",
"理财通",
"ctrip",
"yy",
"华为数字",
"kingsoft",
"孙宁金融",
"房江湖经纪",
"youku",
"ant financial services group",
"盒马",
"sensetime",
"伊千网络",
"小豹ai翻译棒",
"shopify",
"前海微众银行",
"qd",
"gmail",
"pingpong",
"alibaba group holding limited",
"捷信时空电子商务",
"orientsec",
"乔戈里管理咨询",
"ant",
"锐讯灵通",
"兴业消费金融",
"京东叁佰陆拾度电子商务",
"新浪",
"优酷土豆",
"海康机器人",
"美团单车",
"海康存储",
"领英",
"阿里全球速卖通",
"美菜网",
"京邦达",
"安居客",
"阿里体育",
"相互宝",
"cloudwalk",
"百度智能云",
"贝壳",
"酷狗",
"sunshine consumer finance",
"掌宜",
"奇酷网",
"核新同花顺",
"阿里巴巴影业",
"节创",
"学而思网校",
"速途",
"途牛",
"阿里云计算",
"beijing sensetime",
"alibaba cloud",
"西瓜视频",
"美团优选",
"orient securities limited",
"华为朗新",
"店匠",
"shanghai weishi network",
"友盟",
"飞猪旅行",
"滴滴出行",
"alipay",
"mogu",
"dangdang",
"大麦网",
"汉军智能系统",
"百度地图",
"货车帮",
"狐狸金服",
"众安在线保险经纪",
"华为通信",
"新浪支付",
"zhihu",
"alibaba cloud computing",
"沙发视频",
"金山软件",
"ping an good doctor",
"携程",
"脉脉",
"youku information beijing",
"zhongan",
"艾丁软件",
"乒乓智能",
"蘑菇街",
"taobao",
"华为技术服务",
"仕承文化传播",
"安捷信",
"狐狸互联网小额贷款",
"节点迅捷",
"中国银行",
"搜镇",
"众安在线",
"dingtalk",
"云从科技",
"beijing jingbangda trade",
"moody s",
"滚动的天空",
"yl.pingan.com",
"奇虎",
"alihealth",
"芒果tv",
"lufax",
"美团打车",
"小桔",
"贝壳找房网",
"小米科技",
"vips",
"kindle",
"亚马逊服务",
"citic consumer finance",
"微众",
"搜狗智慧互联网医院",
"盒马鲜生",
"life.pinan.com",
"ph.com.cn",
"银联",
"cmbchina",
"平安金融科技咨询",
"微保",
"甲骨文中国",
"飞书",
"koubei shanghai information",
"企鹅辅导",
"斑马",
"平安租赁",
"云从",
"马上消费",
"hangzhou ali baba advertising",
"金山",
"赛盒",
"科大讯飞",
"金星创业投资",
"平安国际融资租赁",
"360你财富",
"西山居",
"shenzhen qianhai fourth paradigm data",
"海思光电子",
"猎户星空",
"网易公司",
"浪潮",
"粒粒橙传媒",
"招联金融",
"100. me",
"捷信信驰咨询",
"唯品仓",
"orient",
"趣拿",
"摩拜单车",
"天猫精灵",
"菜鸟",
"豹小贩",
"去哪儿",
"米家",
"哈啰单车",
"搜狐体育",
"shopify payments usa",
"高德软件",
"讯联智付",
"乐信",
"唯你搭",
"第四范式",
"菜鸟网络",
"同程",
"yy语音",
"浪潮云",
"东财",
"淘宝",
"寻梦",
"citic securities limited",
"青橙之旅",
"阿里巴巴",
"番茄小说",
"上海亿贝",
"inspur",
"babytree inc",
"海康智慧产业股权投资基金合伙合伙",
"adyen",
"艺龙",
"蚂蚁金服",
"平安金服",
"百度百科",
"unionpay",
"当当",
"阅文集团",
"东方财富",
"东方证券",
"哈罗单车",
"优酷",
"海康",
"alipay china network",
"网商银行",
"钧正",
"property.pingan.com",
"豹咖啡",
"网易",
"我爱cba",
"theduapp",
"360",
"金山数字娱乐",
"新浪阅读",
"alibabagames",
"顺丰",
"支点商贸",
"同程旅行",
"citic securities",
"ele.com",
"tal",
"fresh hema",
"运满满",
"贝壳网",
"酷狗音乐",
"鲜城",
"360健康",
"浪潮世科",
"迅雷网络",
"哔哩哔哩",
"华为电动",
"淘友天下",
"华多网络",
"xunlei networking technologies",
"云杉",
"当当网电子商务",
"津虹网络",
"wedoc cloud hangzhou holdings",
"alisports shanghai",
"旷视金智",
"钉钉中国",
"微影",
"金山快快",
"亿贝",
"wedoc",
"autonavi",
"哈啰助力车",
"google cloud",
"新浪乐居",
"京东股票",
"搜狗智慧远程医疗中心",
"中银消金",
"merchants union consumer finance",
"王者荣耀",
"百度手机",
"美团民宿",
"kaola",
"小屋",
"金山网络",
"来往",
"顺丰速运",
"腾讯课堂",
"百度在线网络",
"美团买菜",
"威视汽车",
"uc mobile",
"来赞达",
"平安健康医疗",
"豹小秘",
"尚网",
"哈勃投资",
" ping an insurance group of china ,",
"小米",
"360好药",
"qq音乐",
"lingxigames",
"faceu激萌",
"搜狗",
"sohu",
"满帮",
"vipshop",
"wishpost",
"金山世游",
"shanghai yibaimi network",
"1688",
"海康汽车",
"顺丰控股",
"华为",
"妙镜vr",
"paybkj.com",
"hellobike",
"豹来电",
"京东",
"驴妈妈",
"momo",
"平安健康险",
"哈勃科技",
"美菜",
"众安在线财产保险",
"海康威视",
"east money information",
"阿里云",
"蝉游记",
"余额宝",
"屋客",
"滴滴",
"shopify international limited",
"百度",
"阿里健康中国",
"阿里通信",
"微梦创科",
"微医云",
"轻颜相机",
"搜易居",
"趣店集团",
"美团云",
"ant group",
"金山云",
"beijing express hand",
"觅觅",
"支付宝",
"滴滴承信科技咨询服务",
"拼多多",
"众安运动",
"乞力电商",
"youcash",
"唯品金融",
"陆金所",
"本地生活",
"sz dji",
"海康智能",
"魔方网聘",
"青藤大学",
"international business machines",
"学而思",
"beijing zhongming century science and",
"猎豹清理大师",
"asinking",
"高德",
"苏宁",
"优酷网",
"艾丁",
"中银消费金融",
"京东健康",
"五八教育",
"pingpongx",
"搜狐时尚",
"阿里广告",
"平安财险",
"中邮消金",
"etao",
"怕怕",
"nyse:cmcm",
"华为培训中心",
"高德地图",
"云狐天下征信",
"大疆创新",
"连尚",
"壹佰米",
"康健公司",
"iqiyi.com",
"360安全云盘",
"馒头直播",
"淘友网",
"东方赢家",
"bank of china",
"微众银行",
"阿里巴巴国际站",
"虾米",
"去哪儿网",
"ctrip travel network shanghai",
"潇湘书院",
"腾讯",
"快乐阳光互动娱乐传媒",
"迅雷",
"weidian",
"滴滴货运",
"ping an puhui enterprise management",
"新浪仓石基金销售",
"搜狐焦点",
"alibaba pictures",
"wps",
"平安",
"lazmall",
"百度开放平台",
"兴业消金",
" 珍爱网",
"京东云",
"小红书",
"1688. com",
"如视智数",
"missfresh",
"pazl.pingan.cn",
"平安集团",
"kugou",
"懂车帝",
"斑马智行",
"浪潮集团",
"netease hangzhou network",
"pagd.net",
"探探",
"chinaliterature",
"amazon亚马逊",
"alphabet",
"当当文创手工艺品电子商务",
"五八邦",
"shenzhen zhenai network information",
"lingshoutong",
"字节",
"lvmama",
"金山办公",
"众安保险",
"时装信息",
"优视科技",
"guangzhou kugou",
"ibm",
"滴滴打车",
"beijing sogou information service",
"megvii",
"健谈哥",
"cloudwalk group",
"蜂联科技",
"冬云",
"京东尚科",
"钢琴块2",
"京东世纪",
"商汤",
"众鸣世纪",
"腾讯音乐",
"迅雷网文化",
"华为云计算技术",
"live.me",
"全球速卖通",
"快的打车",
"hello group inc",
"美丽说",
"suning",
"opengauss",
"lazada",
"tmall",
"acfun",
"当当网",
"中银",
"旷视科技",
"百度钱包",
"淘宝网",
"新浪微博",
"迅雷集团",
"中信消费金融",
"学而思教育",
"平安普惠",
"悟空跨境",
"irobotbox",
"平安产险",
"inspur group",
"世纪卓越快递服务",
"奇虎360",
"webank",
"偶藻",
"唯品支付",
"腾讯云计算",
"众安服务",
"亿之唐",
"beijing 58 information ttechnology",
"平安好医生",
"迅雷之锤",
"旅行小账本",
"芒果游戏",
"新浪传媒",
"旷镜博煊",
"全民k歌",
"滴滴支付",
"北京网心科技",
"挂号网",
"萤石",
"chinavision media group limited",
"猎豹安全大师",
"cmcm",
"趣店",
"蚂蚁财富",
"商汤科技",
"甲骨文",
"百度云",
"百度apollo",
"19 pay",
"stock.pingan.com",
"tiktok",
"alibaba pictures group limited",
"ele",
"考拉",
"天猫",
"腾讯优图",
"起点中文网",
"百度视频",
"shanghai bili bili",
"京东物流",
"ebay marketplaces gmbh",
"alibaba sport",
"wish",
"阿里巴巴中国",
"中国银联",
"alibaba china network",
"china ping an property insurance",
"百度糯米网",
"微软中国",
"一九付",
"4 paradigm",
"叮咚买菜",
"umeng",
"众鸣科技",
"平安财富通",
"google",
"巨量引擎",
"百度贴吧",
"beijing jingdong century information",
"讯飞",
"beijing yunshan information",
"满运软件",
"中邮消费金融",
"饿了么",
"alios",
"腾讯ai实验室",
"第四范式智能",
"瀚星创业投资",
"gradient ventures",
"microsoft",
"哈啰共享汽车",
"乞力电子商务",
"mscf",
"网易影业文化",
"铁友旅游咨询",
"kilimall",
"云企互联投资",
"ping an financial consulting",
"beijng jingdong century commerce",
"高德威智能交通系统",
"中友信息",
"平安医疗健康管理",
"eciticcfc",
"中信证券",
"fliggy",
"电子湾",
"旷云金智",
"微粒贷",
"rsi",
"滴滴云计算",
"google ventures",
"箐程",
"每日优鲜",
"音兔",
"拉扎斯",
"今日头条",
"乐信控股",
"猎豹浏览器",
"细微咨询",
"好未来",
"我乐",
"绘声绘色",
"抖音",
"搜狐新时代",
"飞猪",
"鹅厂",
"贝壳找房",
"tuniu",
"红马传媒文化",
"钉钉",
"马上消费金融",
"360手机",
"平安医保",
"快途",
"alibaba",
"小哈换电",
"大麦",
"恒睿人工智能研究院",
"谷歌资本",
"猎豹",
"穆迪信息"
]

View File

@ -1,595 +0,0 @@
[
"中国科技大学",
"国防科学技术大学",
"清华大学",
"清华",
"tsinghua university",
"thu",
"北京大学",
"北大",
"beijing university",
"pku",
"中国科学技术大学",
"中国科大",
"中科大",
"china science & technology university",
"ustc",
"复旦大学",
"复旦",
"fudan university",
"fdu",
"中国人民大学",
"人大",
"人民大学",
"renmin university of china",
"ruc",
"上海交通大学",
"上海交大",
"shanghai jiao tong university",
"sjtu",
"南京大学",
"南大",
"nanjing university",
"nju",
"同济大学",
"同济",
"tongji university",
"tongji",
"浙江大学",
"浙大",
"zhejiang university",
"zju",
"南开大学",
"南开",
"nankai university",
"nku",
"北京航空航天大学",
"北航",
"beihang university",
"buaa",
"北京师范大学",
"北师",
"北师大",
"beijing normal university",
"bnu",
"武汉大学",
"武大",
"wuhan university",
"whu",
"西安交通大学",
"西安交大",
"xian jiaotong university",
"xjtu",
"天津大学",
"天大",
"university of tianjin",
"tju",
"华中科技大学",
"华中大",
"central china university science and technology",
"hust",
"北京理工大学",
"北理",
"beijing institute of technology",
"bit",
"东南大学",
"东大",
"southeast china university",
"seu",
"中山大学",
"中大",
"zhongshan university",
"sysu",
"华东师范大学",
"华师大",
"east china normal university",
"ecnu",
"哈尔滨工业大学",
"哈工大",
"harbin institute of technology",
"hit",
"厦门大学",
"厦大",
"xiamen university",
"xmu",
"西北工业大学",
"西工大",
"西北工大",
"northwestern polytechnical university",
"npu",
"中南大学",
"中南",
"middle and southern university",
"csu",
"大连理工大学",
"大工",
"institute of technology of dalian",
"dut",
"四川大学",
"川大",
"sichuan university",
"scu",
"电子科技大学",
"电子科大",
"university of electronic science and technology of china",
"uestc",
"华南理工大学",
"华南理工",
"institutes of technology of south china",
"scut",
"吉林大学",
"吉大",
"jilin university",
"jlu",
"湖南大学",
"湖大",
"hunan university",
"hnu",
"重庆大学",
"重大",
"university of chongqing",
"cqu",
"山东大学",
"山大",
"shandong university",
"sdu",
"中国农业大学",
"中国农大",
"china agricultural university",
"cau",
"中国海洋大学",
"中国海大",
"chinese marine university",
"ouc",
"中央民族大学",
"中央民大",
"central university for nationalities",
"muc",
"东北大学",
"东北工学院",
"northeastern university",
"neu 或 nu",
"兰州大学",
"兰大",
"lanzhou university",
"lzu",
"西北农林科技大学",
"西农","西北农大",
"northwest a&f university",
"nwafu",
"中国人民解放军国防科技大学",
"国防科技大学","国防科大",
"national university of defense technology",
"nudt",
"郑州大学",
"郑大",
"zhengzhou university",
"zzu",
"云南大学",
"云大",
"yunnan university",
"ynu",
"新疆大学",
"新大",
"xinjiang university",
"xju",
"北京交通大学",
"北京交大",
"beijing jiaotong university",
"bjtu",
"北京工业大学",
"北工大",
"beijing university of technology",
"bjut",
"北京科技大学",
"北科大","北京科大",
"university of science and technology beijing",
"ustb",
"北京化工大学",
"北化",
"beijing university of chemical technology",
"buct",
"北京邮电大学",
"北邮",
"beijing university of posts and telecommunications",
"beijing university of post and telecommunications",
"beijing university of post and telecommunication",
"beijing university of posts and telecommunication",
"bupt",
"北京林业大学",
"北林",
"beijing forestry university",
"bfu",
"北京协和医学院",
"协和医学院",
"peking union medical college",
"pumc",
"北京中医药大学",
"北中医",
"beijing university of chinese medicine",
"bucm",
"首都师范大学",
"首师大",
"capital normal university",
"cnu",
"北京外国语大学",
"北外",
"beijing foreign studies university",
"bfsu",
"中国传媒大学",
"中媒",
"中传",
"北京广播学院",
"communication university of china",
"cuc",
"中央财经大学",
"中央财大",
"中财大",
"the central university of finance and economics",
"cufe",
"对外经济贸易大学",
"对外经贸大学",
"贸大",
"university of international business and economics",
"uibe",
"外交学院",
"外院",
"china foreign affairs university",
"cfau",
"中国人民公安大学",
"公安大学",
"people's public security university of china",
"ppsuc",
"北京体育大学",
"北体大",
"beijing sport university",
"bsu",
"中央音乐学院",
"央音",
"中央院",
"central conservatory of music",
"ccom",
"中国音乐学院",
"国音",
"中国院",
"china conservatory of music",
"ccmusic",
"中央美术学院",
"央美",
"central academy of fine art",
"cafa",
"中央戏剧学院",
"中戏",
"the central academy of drama",
"tcad",
"中国政法大学",
"法大",
"china university of political science and law",
"zuc",
"cupl",
"中国科学院大学",
"国科大",
"科院大",
"university of chinese academy of sciences",
"ucas",
"福州大学",
"福大",
"university of fuzhou",
"fzu",
"暨南大学",
"暨大",
"ji'nan university",
"jnu",
"广州中医药大学",
"广中医",
"traditional chinese medicine university of guangzhou",
"gucm",
"华南师范大学",
"华南师大",
"south china normal university",
"scnu",
"广西大学",
"西大",
"guangxi university",
"gxu",
"贵州大学",
"贵大",
"guizhou university",
"gzu",
"海南大学",
"海大",
"university of hainan",
"hainu",
"河南大学",
"河大",
"he'nan university",
"henu",
"哈尔滨工程大学",
"哈工程",
"harbin engineering university",
"heu",
"东北农业大学",
"东北农大",
"northeast agricultural university",
"neau",
"东北林业大学",
"东北林大",
"northeast forestry university",
"nefu",
"中国地质大学",
"地大",
"china university of geosciences",
"cug",
"武汉理工大学",
"武汉理工",
"wuhan university of technology",
"wut",
"华中农业大学",
"华中农大",
"华农",
"central china agricultural university",
"hzau",
"华中师范大学",
"华中师大",
"华大",
"central china normal university",
"ccnu",
"中南财经政法大学",
"中南大",
"zhongnan university of economics & law",
"zuel",
"湖南师范大学",
"湖南师大",
"hunan normal university",
"hunnu",
"延边大学",
"延大",
"yanbian university",
"ybu",
"东北师范大学",
"东北师大",
"northeast normal university",
"nenu",
"苏州大学",
"苏大",
"soochow university",
"suda",
"南京航空航天大学",
"南航",
"nanjing aero-space university",
"nuaa",
"南京理工大学",
"南理工",
"institutes of technology of nanjing",
"njust",
"中国矿业大学",
"中国矿大",
"china mining university",
"cumt",
"南京邮电大学",
"南邮",
"nanjing university of posts and telecommunications",
"njupt",
"河海大学",
"河海",
"river sea university",
"hhu",
"江南大学",
"江南大",
"jiangnan university",
"jiangnan",
"南京林业大学",
"南林",
"nanjing forestry university",
"njfu",
"南京信息工程大学",
"南信大",
"nanjing university of information science and technology",
"nuist",
"南京农业大学",
"南农",
"南农大",
"南京农大",
"agricultural university of nanjing",
"njau",
"nau",
"南京中医药大学",
"南中医",
"nanjing university of chinese medicine",
"njucm",
"中国药科大学",
"中国药大",
"china medicine university",
"cpu",
"南京师范大学",
"南京师大",
"南师大",
"南师",
"nanjing normal university",
"nnu",
"南昌大学",
"昌大",
"university of nanchang","nanchang university",
"ncu",
"辽宁大学",
"辽大",
"liaoning university",
"lnu",
"大连海事大学",
"大连海大",
"海大",
"maritime affairs university of dalian",
"dmu",
"内蒙古大学",
"内大",
"university of the inner mongol","inner mongolia university",
"imu",
"宁夏大学",
"宁大",
"ningxia university",
"nxu",
"青海大学",
"清大",
"qinghai university",
"qhu",
"中国石油大学",
"中石大",
"china university of petroleum beijing",
"upc",
"太原理工大学",
"太原理工",
"institutes of technology of taiyuan","taiyuan university of technology",
"tyut",
"西北大学",
"西大",
"northwest university",
"nwu",
"西安电子科技大学",
"西电",
"xidian university",
"xdu",
"长安大学",
"长大",
"chang`an university",
"chu",
"陕西师范大学",
"陕西师大",
"陕师大",
"shaanxi normal university",
"snnu",
"第四军医大学",
"空军军医大学","四医大",
"air force medical university",
"fmmu",
"华东理工大学",
"华理",
"east china university of science",
"ecust",
"东华大学",
"东华",
"donghua university",
"dhu",
"上海海洋大学",
"上海海大",
"shanghai ocean university",
"shou",
"上海中医药大学",
"上中医",
"shanghai university of traditional chinese medicine",
"shutcm",
"上海外国语大学",
"上外",
"shanghai international studies university",
"sisu",
"上海财经大学",
"上海财大",
"上财",
"shanghai university of finance",
"sufe",
"上海体育学院",
"shanghai university of sport",
"上海音乐学院",
"上音",
"shanghai conservatory of music",
"shcm",
"上海大学",
"上大",
"shanghai university",
"第二军医大学",
"海军军医大学",
"naval medical university",
"西南交通大学",
"西南交大",
"southwest jiaotong university",
"swjtu",
"西南石油大学",
"西南石大",
"southwest petroleum university",
"swpu",
"成都理工大学",
"成都理工",
"chengdu university of technology",
"cdut ",
"四川农业大学",
"川农",
"川农大",
"sichuan agricultural university",
"sicau",
"成都中医药大学",
"成中医",
"chengdu university of tcm",
"cdutcm",
"西南财经大学",
"西南财大",
"西财",
"southwestern university of finance and economics",
"swufe",
"天津工业大学",
"天工大",
"tianjin university of technology",
"tgu",
"天津医科大学",
"天津医大",
"medical university of tianjin",
"tmu",
"天津中医药大学",
"天中",
"tianjin university of traditional chinese medicine",
"tutcm",
"华北电力大学",
"华电",
"north china electric power university",
"ncepu",
"河北工业大学",
"河工大",
"hebei university of technology",
"hebut",
"西藏大学",
"藏大",
"tibet university",
"tu",
"石河子大学",
"石大",
"shihezi university",
"中国美术学院",
"中国美院",
"国美",
"china academy of art",
"caa",
"宁波大学",
"宁大",
"ningbo university",
"nbu",
"西南大学",
"西大",
"southwest university",
"swu",
"安徽大学",
"安大",
"university of anhui",
"ahu",
"合肥工业大学",
"合肥工大",
"合工大",
"hefei university of technology",
"hfut",
"中国地质大学",
"地大",
"china university of geosciences",
"cug",
"中国地质大学",
"地大",
"北京地大",
"cugb",
"中国矿业大学",
"中国矿大",
"china university of mining & technology",
"cumtb",
"中国石油大学",
"中石大",
"石大",
"china university of petroleum",
"cup",
"中国石油大学",
"中石大",
"cup"]

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,91 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import json
import re
import copy
import pandas as pd
current_file_path = os.path.dirname(os.path.abspath(__file__))
TBL = pd.read_csv(
os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0
).fillna("")
TBL["name_en"] = TBL["name_en"].map(lambda x: x.lower().strip())
GOOD_SCH = json.load(open(os.path.join(current_file_path, "res/good_sch.json"), "r",encoding="utf-8"))
GOOD_SCH = set([re.sub(r"[,. &()]+", "", c) for c in GOOD_SCH])
def loadRank(fnm):
global TBL
TBL["rank"] = 1000000
with open(fnm, "r", encoding="utf-8") as f:
while True:
line = f.readline()
if not line:
break
line = line.strip("\n").split(",")
try:
nm, rk = line[0].strip(), int(line[1])
# assert len(TBL[((TBL.name_cn == nm) | (TBL.name_en == nm))]),f"<{nm}>"
TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk
except Exception:
pass
loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
def split(txt):
tks = []
for t in re.sub(r"[ \t]+", " ", txt).split():
if (
tks
and re.match(r".*[a-zA-Z]$", tks[-1])
and re.match(r"[a-zA-Z]", t)
and tks
):
tks[-1] = tks[-1] + " " + t
else:
tks.append(t)
return tks
def select(nm):
global TBL
if not nm:
return
if isinstance(nm, list):
nm = str(nm[0])
nm = split(nm)[0]
nm = str(nm).lower().strip()
nm = re.sub(r"[(][^()]+[)]", "", nm.lower())
nm = re.sub(r"(^the |[,.&();;·]+|^(英国|美国|瑞士))", "", nm)
nm = re.sub(r"大学.*学院", "大学", nm)
tbl = copy.deepcopy(TBL)
tbl["hit_alias"] = tbl["alias"].map(lambda x: nm in set(x.split("+")))
res = tbl[((tbl.name_cn == nm) | (tbl.name_en == nm) | tbl.hit_alias)]
if res.empty:
return
return json.loads(res.to_json(orient="records"))[0]
def is_good(nm):
global GOOD_SCH
nm = re.sub(r"[(][^()]+[)]", "", nm.lower())
nm = re.sub(r"[''`‘’“”,. &();]+", "", nm)
return nm in GOOD_SCH
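A minimal, hypothetical usage sketch of the lookup helpers above, assuming this removed schools module and its res/schools.csv, res/school.rank.csv and res/good_sch.json resources are loadable:

# Illustrative calls against the removed schools module (not part of the original diff).
hit = select("清华大学(北京)")          # parenthesised parts and leading "the/英国/美国/瑞士" are stripped before
if hit:                                 # matching name_cn, name_en or the "+"-separated alias column
    print(hit.get("rank"))              # rank is filled from res/school.rank.csv; unranked rows keep the default 1000000
print(is_good("Tsinghua University"))   # True only if the normalised name appears in res/good_sch.json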

View File

@ -1,189 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
from deepdoc.parser.resume.entities import degrees, regions, industries
FIELDS = [
"address STRING",
"annual_salary int",
"annual_salary_from int",
"annual_salary_to int",
"birth STRING",
"card STRING",
"certificate_obj string",
"city STRING",
"corporation_id int",
"corporation_name STRING",
"corporation_type STRING",
"degree STRING",
"discipline_name STRING",
"education_obj string",
"email STRING",
"expect_annual_salary int",
"expect_city_names string",
"expect_industry_name STRING",
"expect_position_name STRING",
"expect_salary_from int",
"expect_salary_to int",
"expect_type STRING",
"gender STRING",
"industry_name STRING",
"industry_names STRING",
"is_deleted STRING",
"is_fertility STRING",
"is_house STRING",
"is_management_experience STRING",
"is_marital STRING",
"is_oversea STRING",
"language_obj string",
"name STRING",
"nation STRING",
"phone STRING",
"political_status STRING",
"position_name STRING",
"project_obj string",
"responsibilities string",
"salary_month int",
"scale STRING",
"school_name STRING",
"self_remark string",
"skill_obj string",
"title_name STRING",
"tob_resume_id STRING",
"updated_at Timestamp",
"wechat STRING",
"work_obj string",
"work_experience int",
"work_start_time BIGINT"
]
def refactor(df):
def deal_obj(obj, k, kk):
if not isinstance(obj, type({})):
return ""
obj = obj.get(k, {})
if not isinstance(obj, type({})):
return ""
return obj.get(kk, "")
def loadjson(line):
try:
return json.loads(line)
except Exception:
pass
return {}
df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
df.fillna("", inplace=True)
clms = ["tob_resume_id", "updated_at"]
def extract(nms, cc=None):
nonlocal clms
clms.extend(nms)
for c in nms:
if cc:
df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
else:
df[c] = df["obj"].map(
lambda x: json.dumps(
x.get(
c,
{}),
ensure_ascii=False) if isinstance(
x,
type(
{})) and (
isinstance(
x.get(c),
type(
{})) or not x.get(c)) else str(x).replace(
"None",
""))
extract(["education", "work", "certificate", "project", "language",
"skill"])
extract(["wechat", "phone", "is_deleted",
"name", "tel", "email"], "contact")
extract(["nation", "expect_industry_name", "salary_month",
"industry_ids", "is_house", "birth", "annual_salary_from",
"annual_salary_to", "card",
"expect_salary_to", "expect_salary_from",
"expect_position_name", "gender", "city",
"is_fertility", "expect_city_names",
"political_status", "title_name", "expect_annual_salary",
"industry_name", "address", "position_name", "school_name",
"corporation_id",
"is_oversea", "responsibilities",
"work_start_time", "degree", "management_experience",
"expect_type", "corporation_type", "scale", "corporation_name",
"self_remark", "annual_salary", "work_experience",
"discipline_name", "marital", "updated_at"], "basic")
df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in
str(x).split(",")]))
clms.append("industry_names")
def arr2str(a):
if not a:
return ""
if isinstance(a, list):
a = " ".join([str(i) for i in a])
return str(a).replace(",", " ")
df["expect_industry_name"] = df["expect_industry_name"].map(
lambda x: arr2str(x))
df["gender"] = df["gender"].map(
lambda x: "" if x == 'M' else (
"" if x == 'F' else ""))
for c in ["is_fertility", "is_oversea", "is_house",
"management_experience", "marital"]:
df[c] = df[c].map(
lambda x: '是' if x == 'Y' else (
'否' if x == 'N' else ""))
df["is_management_experience"] = df["management_experience"]
df["is_marital"] = df["marital"]
clms.extend(["is_management_experience", "is_marital"])
df.fillna("", inplace=True)
for i in range(len(df)):
if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
df.loc[i, "phone"] = df.loc[i, "tel"].strip()
for n in ["industry_ids", "management_experience", "marital", "tel"]:
for i in range(len(clms)):
if clms[i] == n:
del clms[i]
break
clms = list(set(clms))
df = df.reindex(sorted(clms), axis=1)
#print(json.dumps(list(df.columns.values)), "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL")
for c in clms:
df[c] = df[c].map(
lambda s: str(s).replace(
"\t",
" ").replace(
"\n",
"\\n").replace(
"\r",
"\\n"))
# print(df.values.tolist())
return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0]))
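The core of refactor() is flattening the nested resume_content JSON into flat columns; below is a simplified, self-contained sketch of that pattern with toy data (no pandas or entity modules required):

# Simplified illustration of the loadjson / deal_obj flattening used by refactor() above.
import json

row = {"resume_content": '{"basic": {"name": "张三", "degree": "5"}, "contact": {"phone": "13800000000"}}'}

def loadjson(line):
    try:
        return json.loads(line)
    except Exception:
        return {}                        # malformed rows degrade to an empty dict instead of raising

obj = loadjson(row["resume_content"])
# deal_obj-style access: pull one leaf out of a named section, defaulting to "" when anything is missing.
name = obj.get("basic", {}).get("name", "")
phone = obj.get("contact", {}).get("phone", "")
print(name, phone)                       # 张三 13800000000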

View File

@ -1,696 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
import copy
import time
import datetime
import demjson3
import traceback
import signal
import numpy as np
from deepdoc.parser.resume.entities import degrees, schools, corporations
from rag.nlp import rag_tokenizer, surname
from xpinyin import Pinyin
from contextlib import contextmanager
class TimeoutException(Exception):
pass
@contextmanager
def time_limit(seconds):
def signal_handler(signum, frame):
raise TimeoutException("Timed out!")
signal.signal(signal.SIGALRM, signal_handler)
signal.alarm(seconds)
try:
yield
finally:
signal.alarm(0)
ENV = None
PY = Pinyin()
def rmHtmlTag(line):
return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, 100000, re.IGNORECASE)
def highest_degree(dg):
if not dg:
return ""
if isinstance(dg, str):
dg = [dg]
m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
return sorted([(d, m.get(d, -1)) for d in dg], key=lambda x: x[1] * -1)[0][0]
def forEdu(cv):
if not cv.get("education_obj"):
cv["integerity_flt"] *= 0.8
return cv
first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
edu_nst = []
edu_end_dt = ""
cv["school_rank_int"] = 1000000
for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
e = {}
if n.get("end_time"):
if n["end_time"] > edu_end_dt:
edu_end_dt = n["end_time"]
try:
dt = n["end_time"]
if re.match(r"[0-9]{9,}", dt):
dt = turnTm2Dt(dt)
y, m, d = getYMD(dt)
ed_dt.append(str(y))
e["end_dt_kwd"] = str(y)
except Exception as e:
pass
if n.get("start_time"):
try:
dt = n["start_time"]
if re.match(r"[0-9]{9,}", dt):
dt = turnTm2Dt(dt)
y, m, d = getYMD(dt)
st_dt.append(str(y))
e["start_dt_kwd"] = str(y)
except Exception:
pass
r = schools.select(n.get("school_name", ""))
if r:
if str(r.get("type", "")) == "1":
fea.append("211")
if str(r.get("type", "")) == "2":
fea.append("211")
if str(r.get("is_abroad", "")) == "1":
fea.append("留学")
if str(r.get("is_double_first", "")) == "1":
fea.append("双一流")
if str(r.get("is_985", "")) == "1":
fea.append("985")
if str(r.get("is_world_known", "")) == "1":
fea.append("海外知名")
if r.get("rank") and cv["school_rank_int"] > r["rank"]:
cv["school_rank_int"] = r["rank"]
if n.get("school_name") and isinstance(n["school_name"], str):
sch.append(re.sub(r"(211|985|重点大学|[,&;-])", "", n["school_name"]))
e["sch_nm_kwd"] = sch[-1]
fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()[-1])
if n.get("discipline_name") and isinstance(n["discipline_name"], str):
maj.append(n["discipline_name"])
e["major_kwd"] = n["discipline_name"]
if not n.get("degree") and "985" in fea and not first_fea:
n["degree"] = "1"
if n.get("degree"):
d = degrees.get_name(n["degree"])
if d:
e["degree_kwd"] = d
if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)", n.get("school_name",""))):
d = "专升本"
if d:
deg.append(d)
# for first degree
if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
fdeg = [d]
if n.get("school_name"):
fsch = [n["school_name"]]
if n.get("discipline_name"):
fmaj = [n["discipline_name"]]
first_fea = copy.deepcopy(fea)
edu_nst.append(e)
cv["sch_rank_kwd"] = []
if cv["school_rank_int"] <= 20 \
or ("海外名校" in fea and cv["school_rank_int"] <= 200):
cv["sch_rank_kwd"].append("顶尖学校")
elif cv["school_rank_int"] <= 50 and cv["school_rank_int"] > 20 \
or ("海外名校" in fea and cv["school_rank_int"] <= 500 and \
cv["school_rank_int"] > 200):
cv["sch_rank_kwd"].append("精英学校")
elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) \
or ("海外名校" in fea and cv["school_rank_int"] > 500):
cv["sch_rank_kwd"].append("优质学校")
else:
cv["sch_rank_kwd"].append("一般学校")
if edu_nst:
cv["edu_nst"] = edu_nst
if fea:
cv["edu_fea_kwd"] = list(set(fea))
if first_fea:
cv["edu_first_fea_kwd"] = list(set(first_fea))
if maj:
cv["major_kwd"] = maj
if fsch:
cv["first_school_name_kwd"] = fsch
if fdeg:
cv["first_degree_kwd"] = fdeg
if fmaj:
cv["first_major_kwd"] = fmaj
if st_dt:
cv["edu_start_kwd"] = st_dt
if ed_dt:
cv["edu_end_kwd"] = ed_dt
if ed_dt:
cv["edu_end_int"] = max([int(t) for t in ed_dt])
if deg:
if "本科" in deg and "专科" in deg:
deg.append("专升本")
deg = [d for d in deg if d != '本科']
cv["degree_kwd"] = deg
cv["highest_degree_kwd"] = highest_degree(deg)
if edu_end_dt:
try:
if re.match(r"[0-9]{9,}", edu_end_dt):
edu_end_dt = turnTm2Dt(edu_end_dt)
if edu_end_dt.strip("\n") == "至今":
edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
y, m, d = getYMD(edu_end_dt)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
logging.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
if sch:
cv["school_name_kwd"] = sch
if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
or not cv.get("degree_kwd"):
for c in sch:
if schools.is_good(c):
if "tag_kwd" not in cv:
cv["tag_kwd"] = []
cv["tag_kwd"].append("好学校")
cv["tag_kwd"].append("好学历")
break
if (len(cv.get("degree_kwd", [])) >= 1 and \
"本科" in cv["degree_kwd"] and \
any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
if "tag_kwd" not in cv:
cv["tag_kwd"] = []
if "好学历" not in cv["tag_kwd"]:
cv["tag_kwd"].append("好学历")
if cv.get("major_kwd"):
cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
if cv.get("school_name_kwd"):
cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
if cv.get("first_school_name_kwd"):
cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
if cv.get("first_major_kwd"):
cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
return cv
def forProj(cv):
if not cv.get("project_obj"):
return cv
pro_nms, desc = [], []
for i, n in enumerate(
sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if isinstance(x, dict) else "",
reverse=True)):
if n.get("name"):
pro_nms.append(n["name"])
if n.get("describe"):
desc.append(str(n["describe"]))
if n.get("responsibilities"):
desc.append(str(n["responsibilities"]))
if n.get("achivement"):
desc.append(str(n["achivement"]))
if pro_nms:
# cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
if desc:
cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))
return cv
def json_loads(line):
return demjson3.decode(re.sub(r": *(True|False)", r": '\1'", line))
def forWork(cv):
if not cv.get("work_obj"):
cv["integerity_flt"] *= 0.7
return cv
flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
"industry_name", "subordinates_count"]
duas = []
scales = []
fea = {c: [] for c in flds}
latest_job_tm = ""
goodcorp = False
goodcorp_ = False
work_st_tm = ""
corp_tags = []
for i, n in enumerate(
sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if isinstance(x, dict) else "",
reverse=True)):
if isinstance(n, str):
try:
n = json_loads(n)
except Exception:
continue
if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm):
work_st_tm = n["start_time"]
for c in flds:
if not n.get(c) or str(n[c]) == '0':
fea[c].append("")
continue
if c == "corporation_name":
n[c] = corporations.corpNorm(n[c], False)
if corporations.is_good(n[c]):
if i == 0:
goodcorp = True
else:
goodcorp_ = True
ct = corporations.corp_tag(n[c])
if i == 0:
corp_tags.extend(ct)
elif ct and ct[0] != "软外":
corp_tags.extend([f"{t}(曾)" for t in ct])
fea[c].append(rmHtmlTag(str(n[c]).lower()))
y, m, d = getYMD(n.get("start_time"))
if not y or not m:
continue
st = "%s-%02d-%02d" % (y, int(m), int(d))
latest_job_tm = st
y, m, d = getYMD(n.get("end_time"))
if (not y or not m) and i > 0:
continue
if not y or not m or int(y) > 2022:
y, m, d = getYMD(str(n.get("updated_at", "")))
if not y or not m:
continue
ed = "%s-%02d-%02d" % (y, int(m), int(d))
try:
duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
except Exception:
logging.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))
if n.get("scale"):
r = re.search(r"^([0-9]+)", str(n["scale"]))
if r:
scales.append(int(r.group(1)))
if goodcorp:
if "tag_kwd" not in cv:
cv["tag_kwd"] = []
cv["tag_kwd"].append("好公司")
if goodcorp_:
if "tag_kwd" not in cv:
cv["tag_kwd"] = []
cv["tag_kwd"].append("好公司(曾)")
if corp_tags:
if "tag_kwd" not in cv:
cv["tag_kwd"] = []
cv["tag_kwd"].extend(corp_tags)
cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]
if latest_job_tm:
cv["latest_job_dt"] = latest_job_tm
if fea["corporation_id"]:
cv["corporation_id"] = fea["corporation_id"]
if fea["position_name"]:
cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))
if fea["industry_name"]:
cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))
if fea["corporation_name"]:
cv["corporation_name_kwd"] = fea["corporation_name"][0]
cv["corp_nm_kwd"] = fea["corporation_name"]
cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))
if fea["responsibilities"]:
cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
if fea["subordinates_count"]:
fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
re.match(r"[^0-9]+$", str(i))]
if fea["subordinates_count"]:
cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])
if isinstance(cv.get("corporation_id"), int):
cv["corporation_id"] = [str(cv["corporation_id"])]
if not cv.get("corporation_id"):
cv["corporation_id"] = []
for i in cv.get("corporation_id", []):
cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)
if work_st_tm:
try:
if re.match(r"[0-9]{9,}", work_st_tm):
work_st_tm = turnTm2Dt(work_st_tm)
y, m, d = getYMD(work_st_tm)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
logging.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))
cv["job_num_int"] = 0
if duas:
cv["dua_flt"] = np.mean(duas)
cv["cur_dua_int"] = duas[0]
cv["job_num_int"] = len(duas)
if scales:
cv["scale_flt"] = np.max(scales)
return cv
def turnTm2Dt(b):
if not b:
return
b = str(b).strip()
if re.match(r"[0-9]{10,}", b):
b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
return b
def getYMD(b):
y, m, d = "", "", "01"
if not b:
return (y, m, d)
b = turnTm2Dt(b)
if re.match(r"[0-9]{4}", b):
y = int(b[:4])
r = re.search(r"[0-9]{4}.?([0-9]{1,2})", b)
if r:
m = r.group(1)
r = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b)
if r:
d = r.group(1)
if not d or int(d) == 0 or int(d) > 31:
d = "1"
if not m or int(m) > 12 or int(m) < 1:
m = "1"
return (y, m, d)
def birth(cv):
if not cv.get("birth"):
cv["integerity_flt"] *= 0.9
return cv
y, m, d = getYMD(cv["birth"])
if not m or not y:
return cv
b = "%s-%02d-%02d" % (y, int(m), int(d))
cv["birth_dt"] = b
cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d))
cv["age_int"] = datetime.datetime.now().year - int(y)
return cv
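A small sanity-check sketch for the date helpers above, assuming turnTm2Dt, getYMD and birth are in scope (illustrative; age_int depends on the current date):

# Hypothetical usage of the removed date helpers (not part of the original diff).
print(getYMD("2018-09-30"))              # expected (2018, "09", "30")
print(getYMD("2020"))                    # month/day fall back when absent: (2020, "1", "01")
cv = birth({"birth": "1990-05-01", "integerity_flt": 1.0})
print(cv["birth_dt"], cv["birthday_kwd"])  # expected "1990-05-01" and "0501"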
def parse(cv):
for k in cv.keys():
if cv[k] == '\\N':
cv[k] = ''
# cv = cv.asDict()
tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
"expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
"position_name", "school_name", "self_remark", "title_name"]
small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
"expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
"industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
"expect_salary_to", "salary_month"]
is_fld = [
("is_fertility", "已育", "未育"),
("is_house", "有房", "没房"),
("is_management_experience", "有管理经验", "无管理经验"),
("is_marital", "已婚", "未婚"),
("is_oversea", "有海外经验", "无海外经验")
]
rmkeys = []
for k in cv.keys():
if cv[k] is None:
rmkeys.append(k)
if (isinstance(cv[k], list) or isinstance(cv[k], str)) and len(cv[k]) == 0:
rmkeys.append(k)
for k in rmkeys:
del cv[k]
integerity = 0.
flds_num = 0.
def hasValues(flds):
nonlocal integerity, flds_num
flds_num += len(flds)
for f in flds:
v = str(cv.get(f, ""))
if len(v) > 0 and v != '0' and v != '[]':
integerity += 1
hasValues(tks_fld)
hasValues(small_tks_fld)
hasValues(kwd_fld)
hasValues(num_fld)
cv["integerity_flt"] = integerity / flds_num
if cv.get("corporation_type"):
for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
(r"[/.· <\(]+.*", ""),
(r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
(r".*(机关|事业).*", "机关"),
(r".*(非盈利|Non-profit).*", "非盈利"),
(r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
(r".*国有.*", "国企"),
(r"[ \(\)人/·0-9-]+", ""),
(r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
if len(cv["corporation_type"]) < 2:
del cv["corporation_type"]
if cv.get("political_status"):
for p, r in [
(r".*党员.*", "党员"),
(r".*(无党派|公民).*", "群众"),
(r".*团员.*", "团员")]:
cv["political_status"] = re.sub(p, r, cv["political_status"])
if not re.search(r"[党团群]", cv["political_status"]):
del cv["political_status"]
if cv.get("phone"):
cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))
keys = list(cv.keys())
for k in keys:
# deal with json objects
if k.find("_obj") > 0:
try:
cv[k] = json_loads(cv[k])
cv[k] = [a for _, a in cv[k].items()]
nms = []
for n in cv[k]:
if not isinstance(n, dict) or "name" not in n or not n.get("name"):
continue
n["name"] = re.sub(r"(442|\t )", "", n["name"]).strip().lower()
if not n["name"]:
continue
nms.append(n["name"])
if nms:
t = k[:-4]
cv[f"{t}_kwd"] = nms
cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
except Exception:
logging.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
cv[k] = []
# tokenize fields
if k in tks_fld:
cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
if k in small_tks_fld:
cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
# keyword fields
if k in kwd_fld:
cv[f"{k}_kwd"] = [n.lower()
for n in re.split(r"[\t,;. ]",
re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1\2", cv[k])
) if n]
if k in num_fld and cv.get(k):
cv[f"{k}_int"] = cv[k]
cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
# for name field
if cv.get("name"):
nm = re.sub(r"[\n——\-\(\+].*", "", cv["name"].strip())
nm = re.sub(r"[ \t ]+", " ", nm)
if re.match(r"[a-zA-Z ]+$", nm):
if len(nm.split()) > 1:
cv["name"] = nm
else:
nm = ""
elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
else:
nm = ""
cv["name"] = nm.strip()
name = cv["name"]
# name pingyin and its prefix
cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
cv["name_py_pref0_tks"] = ""
cv["name_py_pref_tks"] = ""
for py in PY.get_pinyins(nm[:20], ''):
for i in range(2, len(py) + 1):
cv["name_py_pref_tks"] += " " + py[:i]
for py in PY.get_pinyins(nm[:20], ' '):
py = py.split()
for i in range(1, len(py) + 1):
cv["name_py_pref0_tks"] += " " + "".join(py[:i])
cv["name_kwd"] = name
cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
cv["name_tks"] = (
rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
) if name else ""
else:
cv["integerity_flt"] /= 2.
if cv.get("phone"):
r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
if not r:
cv["phone"] = ""
else:
cv["phone"] = r.group(1)
# deal with date fields
if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
else:
y, m, d = getYMD(str(cv.get("updated_at", "")))
if not y:
y = "2012"
if not m:
m = "01"
if not d:
d = "01"
cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
# long text tokenize
if cv.get("responsibilities"):
cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
# for yes or no field
fea = []
for f, y, n in is_fld:
if f not in cv:
continue
if cv[f] == '是':
fea.append(y)
if cv[f] == '否':
fea.append(n)
if fea:
cv["tag_kwd"] = fea
cv = forEdu(cv)
cv = forProj(cv)
cv = forWork(cv)
cv = birth(cv)
cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
for i in range(len(cv["corp_proj_sch_deg_kwd"])):
for j in cv.get("sch_rank_kwd", []):
cv["corp_proj_sch_deg_kwd"][i] += "+" + j
for i in range(len(cv["corp_proj_sch_deg_kwd"])):
if cv.get("highest_degree_kwd"):
cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]
try:
if not cv.get("work_exp_flt") and cv.get("work_start_time"):
if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
y, m, d = getYMD(str(cv["work_start_time"]))
cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
except Exception as e:
logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
if "work_exp_flt" not in cv and cv.get("work_experience", 0):
cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
keys = list(cv.keys())
for k in keys:
if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k):
del cv[k]
for k in cv.keys():
if not re.search("_(kwd|id)$", k) or not isinstance(cv[k], list):
continue
cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
for k in keys:
if cv[k] <= 0:
del cv[k]
cv["tob_resume_id"] = str(cv["tob_resume_id"])
cv["id"] = cv["tob_resume_id"]
logging.debug("CCCCCCCCCCCCCCC")
return dealWithInt64(cv)
def dealWithInt64(d):
if isinstance(d, dict):
for n, v in d.items():
d[n] = dealWithInt64(v)
if isinstance(d, list):
d = [dealWithInt64(t) for t in d]
if isinstance(d, np.integer):
d = int(d)
return d
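# Editor's sketch (not in the original source): dealWithInt64 walks nested dicts/lists and
# casts numpy integers to plain ints so the result can be JSON-serialized; the sample
# payload below is assumed.
if __name__ == "__main__":
    sample = {"age_int": np.int64(35), "scores": [np.int32(1), 2]}
    cleaned = dealWithInt64(sample)
    assert all(isinstance(v, int) for v in [cleaned["age_int"]] + cleaned["scores"])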

View File

@ -1,64 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
from deepdoc.parser.utils import get_text
from rag.nlp import num_tokens_from_string
class RAGFlowTxtParser:
def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"):
txt = get_text(fnm, binary)
return self.parser_txt(txt, chunk_token_num, delimiter)
@classmethod
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
if not isinstance(txt, str):
raise TypeError("txt type should be str!")
cks = [""]
tk_nums = [0]
delimiter = delimiter.encode('utf-8').decode('unicode_escape').encode('latin1').decode('utf-8')
def add_chunk(t):
nonlocal cks, tk_nums, delimiter
tnum = num_tokens_from_string(t)
if tk_nums[-1] > chunk_token_num:
cks.append(t)
tk_nums.append(tnum)
else:
cks[-1] += t
tk_nums[-1] += tnum
dels = []
s = 0
for m in re.finditer(r"`([^`]+)`", delimiter, re.I):
f, t = m.span()
dels.append(m.group(1))
dels.extend(list(delimiter[s: f]))
s = t
if s < len(delimiter):
dels.extend(list(delimiter[s:]))
dels = [re.escape(d) for d in dels if d]
dels = [d for d in dels if d]
dels = "|".join(dels)
secs = re.split(r"(%s)" % dels, txt)
for sec in secs:
if re.match(f"^{dels}$", sec):
continue
add_chunk(sec)
return [[c, ""] for c in cks]

View File

@ -1,32 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from rag.nlp import find_codec
def get_text(fnm: str, binary=None) -> str:
txt = ""
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r") as f:
while True:
line = f.readline()
if not line:
break
txt += line
return txt
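# Editor's sketch (not in the original source): get_text either decodes in-memory bytes,
# sniffing the codec with find_codec, or reads the file at fnm line by line; the inputs
# below are assumed examples.
if __name__ == "__main__":
    print(get_text(None, binary="hello, 世界".encode("utf-8")))  # decode raw bytes
    # print(get_text("/path/to/sample.txt"))                     # or read from disk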

View File

@ -1,86 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import io
import sys
import threading
import pdfplumber
from .ocr import OCR
from .recognizer import Recognizer
from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
from .table_structure_recognizer import TableStructureRecognizer
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
def init_in_out(args):
from PIL import Image
import os
import traceback
from api.utils.file_utils import traversal_files
images = []
outputs = []
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
def pdf_pages(fnm, zoomin=3):
nonlocal outputs, images
with sys.modules[LOCK_KEY_pdfplumber]:
pdf = pdfplumber.open(fnm)
images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
enumerate(pdf.pages)]
for i, page in enumerate(images):
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
pdf.close()
def images_and_outputs(fnm):
nonlocal outputs, images
if fnm.split(".")[-1].lower() == "pdf":
pdf_pages(fnm)
return
try:
fp = open(fnm, 'rb')
binary = fp.read()
fp.close()
images.append(Image.open(io.BytesIO(binary)).convert('RGB'))
outputs.append(os.path.split(fnm)[-1])
except Exception:
traceback.print_exc()
if os.path.isdir(args.inputs):
for fnm in traversal_files(args.inputs):
images_and_outputs(fnm)
else:
images_and_outputs(args.inputs)
for i in range(len(outputs)):
outputs[i] = os.path.join(args.output_dir, outputs[i])
return images, outputs
__all__ = [
"OCR",
"Recognizer",
"LayoutRecognizer",
"TableStructureRecognizer",
"init_in_out",
]
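# Editor's sketch (not in the original source): init_in_out expects an object with `inputs`
# and `output_dir` attributes (argparse-style); the paths below are assumed examples.
if __name__ == "__main__":
    from argparse import Namespace
    args = Namespace(inputs="tests/sample.pdf", output_dir="output/")
    images, outputs = init_in_out(args)  # PDF pages become PIL images plus per-page jpg names
    print(len(images), outputs[:2])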

View File

@ -1,245 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import re
from collections import Counter
from copy import deepcopy
import cv2
import numpy as np
from huggingface_hub import snapshot_download
from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import Recognizer
from deepdoc.vision.operators import nms
class LayoutRecognizer(Recognizer):
labels = [
"_background_",
"Text",
"Title",
"Figure",
"Figure caption",
"Table",
"Table caption",
"Header",
"Footer",
"Reference",
"Equation",
]
def __init__(self, domain):
try:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
super().__init__(self.labels, domain, model_dir)
except Exception:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
local_dir_use_symlinks=False)
super().__init__(self.labels, domain, model_dir)
self.garbage_layouts = ["footer", "header", "reference"]
def __call__(self, image_list, ocr_res, scale_factor=3,
thr=0.2, batch_size=16, drop=True):
def __is_garbage(b):
patt = [r"^•+$", r"(版权归©|免责条款|地址[:])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
"(资料|数据)来源[:]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
"\\(cid *: *[0-9]+ *\\)"
]
return any([re.search(p, b["text"]) for p in patt])
layouts = super().__call__(image_list, thr, batch_size)
# save_results(image_list, layouts, self.labels, output_dir='output/', threshold=0.7)
assert len(image_list) == len(ocr_res)
# Tag layout type
boxes = []
assert len(image_list) == len(layouts)
garbages = {}
page_layout = []
for pn, lts in enumerate(layouts):
bxs = ocr_res[pn]
lts = [{"type": b["type"],
"score": float(b["score"]),
"x0": b["bbox"][0] / scale_factor, "x1": b["bbox"][2] / scale_factor,
"top": b["bbox"][1] / scale_factor, "bottom": b["bbox"][-1] / scale_factor,
"page_number": pn,
} for b in lts if float(b["score"]) >= 0.4 or b["type"] not in self.garbage_layouts]
lts = self.sort_Y_firstly(lts, np.mean(
[lt["bottom"] - lt["top"] for lt in lts]) / 2)
lts = self.layouts_cleanup(bxs, lts)
page_layout.append(lts)
# Tag layout type, layouts are ready
def findLayout(ty):
nonlocal bxs, lts, self
lts_ = [lt for lt in lts if lt["type"] == ty]
i = 0
while i < len(bxs):
if bxs[i].get("layout_type"):
i += 1
continue
if __is_garbage(bxs[i]):
bxs.pop(i)
continue
ii = self.find_overlapped_with_threashold(bxs[i], lts_,
thr=0.4)
if ii is None: # belong to nothing
bxs[i]["layout_type"] = ""
i += 1
continue
lts_[ii]["visited"] = True
keep_feats = [
lts_[ii]["type"] == "footer" and bxs[i]["bottom"] < image_list[pn].size[1] * 0.9 / scale_factor,
lts_[ii]["type"] == "header" and bxs[i]["top"] > image_list[pn].size[1] * 0.1 / scale_factor,
]
if drop and lts_[ii]["type"] in self.garbage_layouts and not any(keep_feats):
if lts_[ii]["type"] not in garbages:
garbages[lts_[ii]["type"]] = []
garbages[lts_[ii]["type"]].append(bxs[i]["text"])
bxs.pop(i)
continue
bxs[i]["layoutno"] = f"{ty}-{ii}"
bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[
ii]["type"] != "equation" else "figure"
i += 1
for lt in ["footer", "header", "reference", "figure caption",
"table caption", "title", "table", "text", "figure", "equation"]:
findLayout(lt)
# add box to figure layouts which has not text box
for i, lt in enumerate(
[lt for lt in lts if lt["type"] in ["figure", "equation"]]):
if lt.get("visited"):
continue
lt = deepcopy(lt)
del lt["type"]
lt["text"] = ""
lt["layout_type"] = "figure"
lt["layoutno"] = f"figure-{i}"
bxs.append(lt)
boxes.extend(bxs)
ocr_res = boxes
garbag_set = set()
for k in garbages.keys():
garbages[k] = Counter(garbages[k])
for g, c in garbages[k].items():
if c > 1:
garbag_set.add(g)
ocr_res = [b for b in ocr_res if b["text"].strip() not in garbag_set]
return ocr_res, page_layout
def forward(self, image_list, thr=0.7, batch_size=16):
return super().__call__(image_list, thr, batch_size)
class LayoutRecognizer4YOLOv10(LayoutRecognizer):
labels = [
"title",
"Text",
"Reference",
"Figure",
"Figure caption",
"Table",
"Table caption",
"Table caption",
"Equation",
"Figure caption",
]
def __init__(self, domain):
domain = "layout"
super().__init__(domain)
self.auto = False
self.scaleFill = False
self.scaleup = True
self.stride = 32
self.center = True
def preprocess(self, image_list):
inputs = []
new_shape = self.input_shape # height, width
for img in image_list:
shape = img.shape[:2]  # current shape [height, width]
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
# Compute padding
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
dw /= 2 # divide padding into 2 sides
dh /= 2
ww, hh = new_unpad
img = np.array(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).astype(np.float32)
img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
img = cv2.copyMakeBorder(
img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
) # add border
img /= 255.0
img = img.transpose(2, 0, 1)
img = img[np.newaxis, :, :, :].astype(np.float32)
inputs.append({self.input_names[0]: img, "scale_factor": [shape[1]/ww, shape[0]/hh, dw, dh]})
return inputs
def postprocess(self, boxes, inputs, thr):
thr = 0.08
boxes = np.squeeze(boxes)
scores = boxes[:, 4]
boxes = boxes[scores > thr, :]
scores = scores[scores > thr]
if len(boxes) == 0:
return []
class_ids = boxes[:, -1].astype(int)
boxes = boxes[:, :4]
boxes[:, 0] -= inputs["scale_factor"][2]
boxes[:, 2] -= inputs["scale_factor"][2]
boxes[:, 1] -= inputs["scale_factor"][3]
boxes[:, 3] -= inputs["scale_factor"][3]
input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0],
inputs["scale_factor"][1]])
boxes = np.multiply(boxes, input_shape, dtype=np.float32)
unique_class_ids = np.unique(class_ids)
indices = []
for class_id in unique_class_ids:
class_indices = np.where(class_ids == class_id)[0]
class_boxes = boxes[class_indices, :]
class_scores = scores[class_indices]
class_keep_boxes = nms(class_boxes, class_scores, 0.45)
indices.extend(class_indices[class_keep_boxes])
return [{
"type": self.label_list[class_ids[i]].lower(),
"bbox": [float(t) for t in boxes[i].tolist()],
"score": float(scores[i])
} for i in indices]
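# Editor's note (not in the original source): preprocess letterboxes each page image to the
# model's input shape, e.g. for an assumed 1000x700 (h x w) page and an assumed 1024x1024 input:
#   r = min(1024/1000, 1024/700) = 1.024 -> new_unpad = (717, 1024)  (w, h)
#   dw, dh = (1024 - 717) / 2, (1024 - 1024) / 2 = 153.5, 0.0 -> padded with value 114
# postprocess then shifts and scales the predicted boxes back using the stored scale_factor.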

View File

@ -1,702 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import copy
import time
import os
from huggingface_hub import snapshot_download
from api.utils.file_utils import get_project_base_directory
from .operators import * # noqa: F403
from . import operators
import math
import numpy as np
import cv2
import onnxruntime as ort
from .postprocess import build_post_process
loaded_models = {}
def transform(data, ops=None):
""" transform """
if ops is None:
ops = []
for op in ops:
data = op(data)
if data is None:
return None
return data
def create_operators(op_param_list, global_config=None):
"""
create operators based on the config
Args:
params(list): a dict list, used to create some operators
"""
assert isinstance(
op_param_list, list), ('operator config should be a list')
ops = []
for operator in op_param_list:
assert isinstance(operator,
dict) and len(operator) == 1, "yaml format error"
op_name = list(operator)[0]
param = {} if operator[op_name] is None else operator[op_name]
if global_config is not None:
param.update(global_config)
op = getattr(operators, op_name)(**param)
ops.append(op)
return ops
def load_model(model_dir, nm):
model_file_path = os.path.join(model_dir, nm + ".onnx")
global loaded_models
loaded_model = loaded_models.get(model_file_path)
if loaded_model:
logging.info(f"load_model {model_file_path} reuses cached model")
return loaded_model
if not os.path.exists(model_file_path):
raise ValueError("not find model file path {}".format(
model_file_path))
def cuda_is_available():
try:
import torch
if torch.cuda.is_available():
return True
except Exception:
return False
return False
options = ort.SessionOptions()
options.enable_cpu_mem_arena = False
options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
options.intra_op_num_threads = 2
options.inter_op_num_threads = 2
# https://github.com/microsoft/onnxruntime/issues/9509#issuecomment-951546580
# Shrink GPU memory after execution
run_options = ort.RunOptions()
if cuda_is_available():
cuda_provider_options = {
"device_id": 0, # Use specific GPU
"gpu_mem_limit": 512 * 1024 * 1024, # Limit gpu memory
"arena_extend_strategy": "kNextPowerOfTwo", # gpu memory allocation strategy
}
sess = ort.InferenceSession(
model_file_path,
options=options,
providers=['CUDAExecutionProvider'],
provider_options=[cuda_provider_options]
)
run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "gpu:0")
logging.info(f"load_model {model_file_path} uses GPU")
else:
sess = ort.InferenceSession(
model_file_path,
options=options,
providers=['CPUExecutionProvider'])
run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "cpu")
logging.info(f"load_model {model_file_path} uses CPU")
loaded_model = (sess, run_options)
loaded_models[model_file_path] = loaded_model
return loaded_model
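# Editor's sketch (not in the original source): load_model returns an (InferenceSession,
# RunOptions) pair and caches it per file path, so repeated calls reuse the same session;
# the model directory and name below are assumed examples.
if __name__ == "__main__":
    sess_a = load_model("rag/res/deepdoc", "det")
    sess_b = load_model("rag/res/deepdoc", "det")
    assert sess_a is sess_b  # served from the loaded_models cache on the second call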
class TextRecognizer:
"""
Text recognizer: reads the actual characters inside detected text regions.
It uses a CTC (Connectionist Temporal Classification) based recognition model
to convert image regions that contain text into strings.
"""
def __init__(self, model_dir):
"""
Initialize the text recognizer.
Args:
model_dir: directory containing the model files
"""
self.rec_image_shape = [int(v) for v in "3, 48, 320".split(",")]
self.rec_batch_num = 16
postprocess_params = {
'name': 'CTCLabelDecode',
"character_dict_path": os.path.join(model_dir, "ocr.res"),
"use_space_char": True
}
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.run_options = load_model(model_dir, 'rec')
self.input_tensor = self.predictor.get_inputs()[0]
def resize_norm_img(self, img, max_wh_ratio):
"""
Resize the image while keeping its aspect ratio, then normalize it.
Args:
img: input image
max_wh_ratio: maximum width/height ratio in the current batch
Returns:
the processed image tensor
"""
imgC, imgH, imgW = self.rec_image_shape
assert imgC == img.shape[2]
imgW = int((imgH * max_wh_ratio))
w = self.input_tensor.shape[3:][0]
if isinstance(w, str):
pass
elif w is not None and w > 0:
imgW = w
h, w = img.shape[:2]
ratio = w / float(h)
if math.ceil(imgH * ratio) > imgW:
resized_w = imgW
else:
resized_w = int(math.ceil(imgH * ratio))
resized_image = cv2.resize(img, (resized_w, imgH))
resized_image = resized_image.astype('float32')
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:resized_w] = resized_image
return padding_im
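# Editor's note (not in the original source): with the default rec shape (3, 48, 320), an
# assumed 24x240 crop (ratio 10) in a batch whose max_wh_ratio is 10 is resized to 48x480;
# narrower crops in the same batch keep their own ratio and are right-padded with zeros up
# to the shared width imgW = int(48 * max_wh_ratio).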
def resize_norm_img_vl(self, img, image_shape):
imgC, imgH, imgW = image_shape
img = img[:, :, ::-1] # bgr2rgb
resized_image = cv2.resize(
img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
resized_image = resized_image.astype('float32')
resized_image = resized_image.transpose((2, 0, 1)) / 255
return resized_image
def resize_norm_img_srn(self, img, image_shape):
imgC, imgH, imgW = image_shape
img_black = np.zeros((imgH, imgW))
im_hei = img.shape[0]
im_wid = img.shape[1]
if im_wid <= im_hei * 1:
img_new = cv2.resize(img, (imgH * 1, imgH))
elif im_wid <= im_hei * 2:
img_new = cv2.resize(img, (imgH * 2, imgH))
elif im_wid <= im_hei * 3:
img_new = cv2.resize(img, (imgH * 3, imgH))
else:
img_new = cv2.resize(img, (imgW, imgH))
img_np = np.asarray(img_new)
img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
img_black[:, 0:img_np.shape[1]] = img_np
img_black = img_black[:, :, np.newaxis]
row, col, c = img_black.shape
c = 1
return np.reshape(img_black, (c, row, col)).astype(np.float32)
def srn_other_inputs(self, image_shape, num_heads, max_text_length):
imgC, imgH, imgW = image_shape
feature_dim = int((imgH / 8) * (imgW / 8))
encoder_word_pos = np.array(range(0, feature_dim)).reshape(
(feature_dim, 1)).astype('int64')
gsrm_word_pos = np.array(range(0, max_text_length)).reshape(
(max_text_length, 1)).astype('int64')
gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
[-1, 1, max_text_length, max_text_length])
gsrm_slf_attn_bias1 = np.tile(
gsrm_slf_attn_bias1,
[1, num_heads, 1, 1]).astype('float32') * [-1e9]
gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
[-1, 1, max_text_length, max_text_length])
gsrm_slf_attn_bias2 = np.tile(
gsrm_slf_attn_bias2,
[1, num_heads, 1, 1]).astype('float32') * [-1e9]
encoder_word_pos = encoder_word_pos[np.newaxis, :]
gsrm_word_pos = gsrm_word_pos[np.newaxis, :]
return [
encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2
]
def process_image_srn(self, img, image_shape, num_heads, max_text_length):
norm_img = self.resize_norm_img_srn(img, image_shape)
norm_img = norm_img[np.newaxis, :]
[encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \
self.srn_other_inputs(image_shape, num_heads, max_text_length)
gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
encoder_word_pos = encoder_word_pos.astype(np.int64)
gsrm_word_pos = gsrm_word_pos.astype(np.int64)
return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2)
def resize_norm_img_sar(self, img, image_shape,
width_downsample_ratio=0.25):
imgC, imgH, imgW_min, imgW_max = image_shape
h = img.shape[0]
w = img.shape[1]
valid_ratio = 1.0
# make sure new_width is an integral multiple of width_divisor.
width_divisor = int(1 / width_downsample_ratio)
# resize
ratio = w / float(h)
resize_w = math.ceil(imgH * ratio)
if resize_w % width_divisor != 0:
resize_w = round(resize_w / width_divisor) * width_divisor
if imgW_min is not None:
resize_w = max(imgW_min, resize_w)
if imgW_max is not None:
valid_ratio = min(1.0, 1.0 * resize_w / imgW_max)
resize_w = min(imgW_max, resize_w)
resized_image = cv2.resize(img, (resize_w, imgH))
resized_image = resized_image.astype('float32')
# norm
if image_shape[0] == 1:
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
else:
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
resize_shape = resized_image.shape
padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32)
padding_im[:, :, 0:resize_w] = resized_image
pad_shape = padding_im.shape
return padding_im, resize_shape, pad_shape, valid_ratio
def resize_norm_img_spin(self, img):
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# return padding_im
img = cv2.resize(img, tuple([100, 32]), cv2.INTER_CUBIC)
img = np.array(img, np.float32)
img = np.expand_dims(img, -1)
img = img.transpose((2, 0, 1))
mean = [127.5]
std = [127.5]
mean = np.array(mean, dtype=np.float32)
std = np.array(std, dtype=np.float32)
mean = np.float32(mean.reshape(1, -1))
stdinv = 1 / np.float32(std.reshape(1, -1))
img -= mean
img *= stdinv
return img
def resize_norm_img_svtr(self, img, image_shape):
imgC, imgH, imgW = image_shape
resized_image = cv2.resize(
img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
resized_image = resized_image.astype('float32')
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
return resized_image
def resize_norm_img_abinet(self, img, image_shape):
imgC, imgH, imgW = image_shape
resized_image = cv2.resize(
img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
resized_image = resized_image.astype('float32')
resized_image = resized_image / 255.
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])
resized_image = (
resized_image - mean[None, None, ...]) / std[None, None, ...]
resized_image = resized_image.transpose((2, 0, 1))
resized_image = resized_image.astype('float32')
return resized_image
def norm_img_can(self, img, image_shape):
img = cv2.cvtColor(
img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image
if self.rec_image_shape[0] == 1:
h, w = img.shape
_, imgH, imgW = self.rec_image_shape
if h < imgH or w < imgW:
padding_h = max(imgH - h, 0)
padding_w = max(imgW - w, 0)
img_padded = np.pad(img, ((0, padding_h), (0, padding_w)),
'constant',
constant_values=(255))
img = img_padded
img = np.expand_dims(img, 0) / 255.0 # h,w,c -> c,h,w
img = img.astype('float32')
return img
def __call__(self, img_list):
img_num = len(img_list)
# Calculate the aspect ratio of all text bars
width_list = []
for img in img_list:
width_list.append(img.shape[1] / float(img.shape[0]))
# Sorting can speed up the recognition process
indices = np.argsort(np.array(width_list))
rec_res = [['', 0.0]] * img_num
batch_num = self.rec_batch_num
st = time.time()
for beg_img_no in range(0, img_num, batch_num):
end_img_no = min(img_num, beg_img_no + batch_num)
norm_img_batch = []
imgC, imgH, imgW = self.rec_image_shape[:3]
max_wh_ratio = imgW / imgH
# max_wh_ratio = 0
for ino in range(beg_img_no, end_img_no):
h, w = img_list[indices[ino]].shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for ino in range(beg_img_no, end_img_no):
norm_img = self.resize_norm_img(img_list[indices[ino]],
max_wh_ratio)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
norm_img_batch = np.concatenate(norm_img_batch)
norm_img_batch = norm_img_batch.copy()
input_dict = {}
input_dict[self.input_tensor.name] = norm_img_batch
for i in range(100000):
try:
outputs = self.predictor.run(None, input_dict, self.run_options)
break
except Exception as e:
if i >= 3:
raise e
time.sleep(5)
preds = outputs[0]
rec_result = self.postprocess_op(preds)
for rno in range(len(rec_result)):
rec_res[indices[beg_img_no + rno]] = rec_result[rno]
return rec_res, time.time() - st
class TextDetector:
"""
Text detector: locates text regions in an image.
It uses a DB (Differentiable Binarization) based detection model to
accurately localize text regions and return the coordinates of their boxes.
"""
def __init__(self, model_dir):
pre_process_list = [{
'DetResizeForTest': {
'limit_side_len': 960,
'limit_type': "max",
}
}, {
'NormalizeImage': {
'std': [0.229, 0.224, 0.225],
'mean': [0.485, 0.456, 0.406],
'scale': '1./255.',
'order': 'hwc'
}
}, {
'ToCHWImage': None
}, {
'KeepKeys': {
'keep_keys': ['image', 'shape']
}
}]
postprocess_params = {"name": "DBPostProcess", "thresh": 0.3, "box_thresh": 0.5, "max_candidates": 1000,
"unclip_ratio": 1.5, "use_dilation": False, "score_mode": "fast", "box_type": "quad"}
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.run_options = load_model(model_dir, 'det')
self.input_tensor = self.predictor.get_inputs()[0]
img_h, img_w = self.input_tensor.shape[2:]
if isinstance(img_h, str) or isinstance(img_w, str):
pass
elif img_h is not None and img_w is not None and img_h > 0 and img_w > 0:
pre_process_list[0] = {
'DetResizeForTest': {
'image_shape': [img_h, img_w]
}
}
self.preprocess_op = create_operators(pre_process_list)
def order_points_clockwise(self, pts):
rect = np.zeros((4, 2), dtype="float32")
s = pts.sum(axis=1)
rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)]
tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0)
diff = np.diff(np.array(tmp), axis=1)
rect[1] = tmp[np.argmin(diff)]
rect[3] = tmp[np.argmax(diff)]
return rect
def clip_det_res(self, points, img_height, img_width):
for pno in range(points.shape[0]):
points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
return points
def filter_tag_det_res(self, dt_boxes, image_shape):
img_height, img_width = image_shape[0:2]
dt_boxes_new = []
for box in dt_boxes:
if isinstance(box, list):
box = np.array(box)
box = self.order_points_clockwise(box)
box = self.clip_det_res(box, img_height, img_width)
rect_width = int(np.linalg.norm(box[0] - box[1]))
rect_height = int(np.linalg.norm(box[0] - box[3]))
if rect_width <= 3 or rect_height <= 3:
continue
dt_boxes_new.append(box)
dt_boxes = np.array(dt_boxes_new)
return dt_boxes
def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
img_height, img_width = image_shape[0:2]
dt_boxes_new = []
for box in dt_boxes:
if isinstance(box, list):
box = np.array(box)
box = self.clip_det_res(box, img_height, img_width)
dt_boxes_new.append(box)
dt_boxes = np.array(dt_boxes_new)
return dt_boxes
def __call__(self, img):
ori_im = img.copy()
data = {'image': img}
st = time.time()
data = transform(data, self.preprocess_op)
img, shape_list = data
if img is None:
return None, 0
img = np.expand_dims(img, axis=0)
shape_list = np.expand_dims(shape_list, axis=0)
img = img.copy()
input_dict = {}
input_dict[self.input_tensor.name] = img
for i in range(100000):
try:
outputs = self.predictor.run(None, input_dict, self.run_options)
break
except Exception as e:
if i >= 3:
raise e
time.sleep(5)
post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
dt_boxes = post_result[0]['points']
dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
return dt_boxes, time.time() - st
class OCR:
def __init__(self, model_dir=None):
"""
If you have trouble downloading HuggingFace models, -_^ this might help!!
For Linux:
export HF_ENDPOINT=https://hf-mirror.com
For Windows:
Good luck
^_-
"""
if not model_dir:
try:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
self.text_detector = TextDetector(model_dir)
self.text_recognizer = TextRecognizer(model_dir)
except Exception:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
local_dir_use_symlinks=False)
self.text_detector = TextDetector(model_dir)
self.text_recognizer = TextRecognizer(model_dir)
self.drop_score = 0.5
self.crop_image_res_index = 0
def get_rotate_crop_image(self, img, points):
'''
img_height, img_width = img.shape[0:2]
left = int(np.min(points[:, 0]))
right = int(np.max(points[:, 0]))
top = int(np.min(points[:, 1]))
bottom = int(np.max(points[:, 1]))
img_crop = img[top:bottom, left:right, :].copy()
points[:, 0] = points[:, 0] - left
points[:, 1] = points[:, 1] - top
'''
assert len(points) == 4, "shape of points must be 4*2"
img_crop_width = int(
max(
np.linalg.norm(points[0] - points[1]),
np.linalg.norm(points[2] - points[3])))
img_crop_height = int(
max(
np.linalg.norm(points[0] - points[3]),
np.linalg.norm(points[1] - points[2])))
pts_std = np.float32([[0, 0], [img_crop_width, 0],
[img_crop_width, img_crop_height],
[0, img_crop_height]])
M = cv2.getPerspectiveTransform(points, pts_std)
dst_img = cv2.warpPerspective(
img,
M, (img_crop_width, img_crop_height),
borderMode=cv2.BORDER_REPLICATE,
flags=cv2.INTER_CUBIC)
dst_img_height, dst_img_width = dst_img.shape[0:2]
if dst_img_height * 1.0 / dst_img_width >= 1.5:
dst_img = np.rot90(dst_img)
return dst_img
def sorted_boxes(self, dt_boxes):
"""
Sort text boxes in order from top to bottom, left to right
args:
dt_boxes(array):detected text boxes with shape [4, 2]
return:
sorted boxes(array) with shape [4, 2]
"""
num_boxes = dt_boxes.shape[0]
sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
_boxes = list(sorted_boxes)
for i in range(num_boxes - 1):
for j in range(i, -1, -1):
if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
(_boxes[j + 1][0][0] < _boxes[j][0][0]):
tmp = _boxes[j]
_boxes[j] = _boxes[j + 1]
_boxes[j + 1] = tmp
else:
break
return _boxes
def detect(self, img):
time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
if img is None:
return None, None, time_dict
start = time.time()
dt_boxes, elapse = self.text_detector(img)
time_dict['det'] = elapse
if dt_boxes is None:
end = time.time()
time_dict['all'] = end - start
return None, None, time_dict
return zip(self.sorted_boxes(dt_boxes), [
("", 0) for _ in range(len(dt_boxes))])
def recognize(self, ori_im, box):
img_crop = self.get_rotate_crop_image(ori_im, box)
rec_res, elapse = self.text_recognizer([img_crop])
text, score = rec_res[0]
if score < self.drop_score:
return ""
return text
def recognize_batch(self, img_list):
rec_res, elapse = self.text_recognizer(img_list)
texts = []
for i in range(len(rec_res)):
text, score = rec_res[i]
if score < self.drop_score:
text = ""
texts.append(text)
return texts
def __call__(self, img, cls=True):
time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
if img is None:
return None, None, time_dict
start = time.time()
ori_im = img.copy()
dt_boxes, elapse = self.text_detector(img)
time_dict['det'] = elapse
if dt_boxes is None:
end = time.time()
time_dict['all'] = end - start
return None, None, time_dict
img_crop_list = []
dt_boxes = self.sorted_boxes(dt_boxes)
for bno in range(len(dt_boxes)):
tmp_box = copy.deepcopy(dt_boxes[bno])
img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
img_crop_list.append(img_crop)
rec_res, elapse = self.text_recognizer(img_crop_list)
time_dict['rec'] = elapse
filter_boxes, filter_rec_res = [], []
for box, rec_result in zip(dt_boxes, rec_res):
text, score = rec_result
if score >= self.drop_score:
filter_boxes.append(box)
filter_rec_res.append(rec_result)
end = time.time()
time_dict['all'] = end - start
# for bno in range(len(img_crop_list)):
# print(f"{bno}, {rec_res[bno]}")
return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))
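# Editor's sketch (not in the original source): end-to-end usage of the OCR pipeline on a
# single page image; the command-line image path is an assumed example.
if __name__ == "__main__":
    import sys
    page = cv2.imread(sys.argv[1])          # BGR ndarray
    for box, (text, score) in OCR()(page):  # detection + recognition, low scores dropped
        print(score, text, box)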

View File

@ -1,725 +0,0 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import sys
import six
import cv2
import numpy as np
import math
from PIL import Image
class DecodeImage:
""" decode image """
def __init__(self,
img_mode='RGB',
channel_first=False,
ignore_orientation=False,
**kwargs):
self.img_mode = img_mode
self.channel_first = channel_first
self.ignore_orientation = ignore_orientation
def __call__(self, data):
img = data['image']
if six.PY2:
assert isinstance(img, str) and len(
img) > 0, "invalid input 'img' in DecodeImage"
else:
assert isinstance(img, bytes) and len(
img) > 0, "invalid input 'img' in DecodeImage"
img = np.frombuffer(img, dtype='uint8')
if self.ignore_orientation:
img = cv2.imdecode(img, cv2.IMREAD_IGNORE_ORIENTATION |
cv2.IMREAD_COLOR)
else:
img = cv2.imdecode(img, 1)
if img is None:
return None
if self.img_mode == 'GRAY':
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
elif self.img_mode == 'RGB':
assert img.shape[2] == 3, 'invalid shape of image[%s]' % (
img.shape)
img = img[:, :, ::-1]
if self.channel_first:
img = img.transpose((2, 0, 1))
data['image'] = img
return data
class StandardizeImage:
"""normalize image
Args:
mean (list): im - mean
std (list): im / std
is_scale (bool): whether need im / 255
norm_type (str): type in ['mean_std', 'none']
"""
def __init__(self, mean, std, is_scale=True, norm_type='mean_std'):
self.mean = mean
self.std = std
self.is_scale = is_scale
self.norm_type = norm_type
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
im = im.astype(np.float32, copy=False)
if self.is_scale:
scale = 1.0 / 255.0
im *= scale
if self.norm_type == 'mean_std':
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
im -= mean
im /= std
return im, im_info
class NormalizeImage:
""" normalize image such as subtract mean, divide std
"""
def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs):
if isinstance(scale, str):
scale = eval(scale)
self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
mean = mean if mean is not None else [0.485, 0.456, 0.406]
std = std if std is not None else [0.229, 0.224, 0.225]
shape = (3, 1, 1) if order == 'chw' else (1, 1, 3)
self.mean = np.array(mean).reshape(shape).astype('float32')
self.std = np.array(std).reshape(shape).astype('float32')
def __call__(self, data):
img = data['image']
from PIL import Image
if isinstance(img, Image.Image):
img = np.array(img)
assert isinstance(img,
np.ndarray), "invalid input 'img' in NormalizeImage"
data['image'] = (
img.astype('float32') * self.scale - self.mean) / self.std
return data
class ToCHWImage:
""" convert hwc image to chw image
"""
def __init__(self, **kwargs):
pass
def __call__(self, data):
img = data['image']
from PIL import Image
if isinstance(img, Image.Image):
img = np.array(img)
data['image'] = img.transpose((2, 0, 1))
return data
class KeepKeys:
def __init__(self, keep_keys, **kwargs):
self.keep_keys = keep_keys
def __call__(self, data):
data_list = []
for key in self.keep_keys:
data_list.append(data[key])
return data_list
class Pad:
def __init__(self, size=None, size_div=32, **kwargs):
if size is not None and not isinstance(size, (int, list, tuple)):
raise TypeError("Type of target_size is invalid. Now is {}".format(
type(size)))
if isinstance(size, int):
size = [size, size]
self.size = size
self.size_div = size_div
def __call__(self, data):
img = data['image']
img_h, img_w = img.shape[0], img.shape[1]
if self.size:
resize_h2, resize_w2 = self.size
assert (
img_h < resize_h2 and img_w < resize_w2
), '(h, w) of target size should be greater than (img_h, img_w)'
else:
resize_h2 = max(
int(math.ceil(img.shape[0] / self.size_div) * self.size_div),
self.size_div)
resize_w2 = max(
int(math.ceil(img.shape[1] / self.size_div) * self.size_div),
self.size_div)
img = cv2.copyMakeBorder(
img,
0,
resize_h2 - img_h,
0,
resize_w2 - img_w,
cv2.BORDER_CONSTANT,
value=0)
data['image'] = img
return data
class LinearResize:
"""resize image by target_size and max_size
Args:
target_size (int): the target size of image
keep_ratio (bool): whether keep_ratio or not, default true
interp (int): method of resize
"""
def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR):
if isinstance(target_size, int):
target_size = [target_size, target_size]
self.target_size = target_size
self.keep_ratio = keep_ratio
self.interp = interp
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
assert len(self.target_size) == 2
assert self.target_size[0] > 0 and self.target_size[1] > 0
_im_channel = im.shape[2]
im_scale_y, im_scale_x = self.generate_scale(im)
im = cv2.resize(
im,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
im_info['im_shape'] = np.array(im.shape[:2]).astype('float32')
im_info['scale_factor'] = np.array(
[im_scale_y, im_scale_x]).astype('float32')
return im, im_info
def generate_scale(self, im):
"""
Args:
im (np.ndarray): image (np.ndarray)
Returns:
im_scale_x: the resize ratio of X
im_scale_y: the resize ratio of Y
"""
origin_shape = im.shape[:2]
_im_c = im.shape[2]
if self.keep_ratio:
im_size_min = np.min(origin_shape)
im_size_max = np.max(origin_shape)
target_size_min = np.min(self.target_size)
target_size_max = np.max(self.target_size)
im_scale = float(target_size_min) / float(im_size_min)
if np.round(im_scale * im_size_max) > target_size_max:
im_scale = float(target_size_max) / float(im_size_max)
im_scale_x = im_scale
im_scale_y = im_scale
else:
resize_h, resize_w = self.target_size
im_scale_y = resize_h / float(origin_shape[0])
im_scale_x = resize_w / float(origin_shape[1])
return im_scale_y, im_scale_x
class Resize:
def __init__(self, size=(640, 640), **kwargs):
self.size = size
def resize_image(self, img):
resize_h, resize_w = self.size
ori_h, ori_w = img.shape[:2] # (h, w, c)
ratio_h = float(resize_h) / ori_h
ratio_w = float(resize_w) / ori_w
img = cv2.resize(img, (int(resize_w), int(resize_h)))
return img, [ratio_h, ratio_w]
def __call__(self, data):
img = data['image']
if 'polys' in data:
text_polys = data['polys']
img_resize, [ratio_h, ratio_w] = self.resize_image(img)
if 'polys' in data:
new_boxes = []
for box in text_polys:
new_box = []
for cord in box:
new_box.append([cord[0] * ratio_w, cord[1] * ratio_h])
new_boxes.append(new_box)
data['polys'] = np.array(new_boxes, dtype=np.float32)
data['image'] = img_resize
return data
class DetResizeForTest:
def __init__(self, **kwargs):
super(DetResizeForTest, self).__init__()
self.resize_type = 0
self.keep_ratio = False
if 'image_shape' in kwargs:
self.image_shape = kwargs['image_shape']
self.resize_type = 1
if 'keep_ratio' in kwargs:
self.keep_ratio = kwargs['keep_ratio']
elif 'limit_side_len' in kwargs:
self.limit_side_len = kwargs['limit_side_len']
self.limit_type = kwargs.get('limit_type', 'min')
elif 'resize_long' in kwargs:
self.resize_type = 2
self.resize_long = kwargs.get('resize_long', 960)
else:
self.limit_side_len = 736
self.limit_type = 'min'
def __call__(self, data):
img = data['image']
src_h, src_w, _ = img.shape
if sum([src_h, src_w]) < 64:
img = self.image_padding(img)
if self.resize_type == 0:
# img, shape = self.resize_image_type0(img)
img, [ratio_h, ratio_w] = self.resize_image_type0(img)
elif self.resize_type == 2:
img, [ratio_h, ratio_w] = self.resize_image_type2(img)
else:
# img, shape = self.resize_image_type1(img)
img, [ratio_h, ratio_w] = self.resize_image_type1(img)
data['image'] = img
data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w])
return data
def image_padding(self, im, value=0):
h, w, c = im.shape
im_pad = np.zeros((max(32, h), max(32, w), c), np.uint8) + value
im_pad[:h, :w, :] = im
return im_pad
def resize_image_type1(self, img):
resize_h, resize_w = self.image_shape
ori_h, ori_w = img.shape[:2] # (h, w, c)
if self.keep_ratio is True:
resize_w = ori_w * resize_h / ori_h
N = math.ceil(resize_w / 32)
resize_w = N * 32
ratio_h = float(resize_h) / ori_h
ratio_w = float(resize_w) / ori_w
img = cv2.resize(img, (int(resize_w), int(resize_h)))
# return img, np.array([ori_h, ori_w])
return img, [ratio_h, ratio_w]
def resize_image_type0(self, img):
"""
resize image to a size multiple of 32 which is required by the network
args:
img(array): array with shape [h, w, c]
return(tuple):
img, (ratio_h, ratio_w)
"""
limit_side_len = self.limit_side_len
h, w, c = img.shape
# limit the max side
if self.limit_type == 'max':
if max(h, w) > limit_side_len:
if h > w:
ratio = float(limit_side_len) / h
else:
ratio = float(limit_side_len) / w
else:
ratio = 1.
elif self.limit_type == 'min':
if min(h, w) < limit_side_len:
if h < w:
ratio = float(limit_side_len) / h
else:
ratio = float(limit_side_len) / w
else:
ratio = 1.
elif self.limit_type == 'resize_long':
ratio = float(limit_side_len) / max(h, w)
else:
raise Exception('unsupported limit_type: {}'.format(self.limit_type))
resize_h = int(h * ratio)
resize_w = int(w * ratio)
resize_h = max(int(round(resize_h / 32) * 32), 32)
resize_w = max(int(round(resize_w / 32) * 32), 32)
try:
if int(resize_w) <= 0 or int(resize_h) <= 0:
return None, (None, None)
img = cv2.resize(img, (int(resize_w), int(resize_h)))
except BaseException:
logging.exception("{} {} {}".format(img.shape, resize_w, resize_h))
sys.exit(0)
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
return img, [ratio_h, ratio_w]
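# Editor's note (not in the original source): with limit_type "max" and limit_side_len 960
# (the values used by TextDetector), an assumed 1000x700 (h x w) image gets ratio 960/1000
# = 0.96, i.e. 960x672, and both sides are then rounded to multiples of 32 -> still 960x672.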
def resize_image_type2(self, img):
h, w, _ = img.shape
resize_w = w
resize_h = h
if resize_h > resize_w:
ratio = float(self.resize_long) / resize_h
else:
ratio = float(self.resize_long) / resize_w
resize_h = int(resize_h * ratio)
resize_w = int(resize_w * ratio)
max_stride = 128
resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
img = cv2.resize(img, (int(resize_w), int(resize_h)))
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
return img, [ratio_h, ratio_w]
class E2EResizeForTest:
def __init__(self, **kwargs):
super(E2EResizeForTest, self).__init__()
self.max_side_len = kwargs['max_side_len']
self.valid_set = kwargs['valid_set']
def __call__(self, data):
img = data['image']
src_h, src_w, _ = img.shape
if self.valid_set == 'totaltext':
im_resized, [ratio_h, ratio_w] = self.resize_image_for_totaltext(
img, max_side_len=self.max_side_len)
else:
im_resized, (ratio_h, ratio_w) = self.resize_image(
img, max_side_len=self.max_side_len)
data['image'] = im_resized
data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w])
return data
def resize_image_for_totaltext(self, im, max_side_len=512):
h, w, _ = im.shape
resize_w = w
resize_h = h
ratio = 1.25
if h * ratio > max_side_len:
ratio = float(max_side_len) / resize_h
resize_h = int(resize_h * ratio)
resize_w = int(resize_w * ratio)
max_stride = 128
resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
im = cv2.resize(im, (int(resize_w), int(resize_h)))
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
return im, (ratio_h, ratio_w)
def resize_image(self, im, max_side_len=512):
"""
resize image to a size multiple of max_stride which is required by the network
:param im: the resized image
:param max_side_len: limit of max image size to avoid out of memory in gpu
:return: the resized image and the resize ratio
"""
h, w, _ = im.shape
resize_w = w
resize_h = h
# Fix the longer side
if resize_h > resize_w:
ratio = float(max_side_len) / resize_h
else:
ratio = float(max_side_len) / resize_w
resize_h = int(resize_h * ratio)
resize_w = int(resize_w * ratio)
max_stride = 128
resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
im = cv2.resize(im, (int(resize_w), int(resize_h)))
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
return im, (ratio_h, ratio_w)
class KieResize:
def __init__(self, **kwargs):
super(KieResize, self).__init__()
self.max_side, self.min_side = kwargs['img_scale'][0], kwargs[
'img_scale'][1]
def __call__(self, data):
img = data['image']
points = data['points']
src_h, src_w, _ = img.shape
im_resized, scale_factor, [ratio_h, ratio_w], [new_h, new_w] = self.resize_image(img)
resize_points = self.resize_boxes(img, points, scale_factor)
data['ori_image'] = img
data['ori_boxes'] = points
data['points'] = resize_points
data['image'] = im_resized
data['shape'] = np.array([new_h, new_w])
return data
def resize_image(self, img):
norm_img = np.zeros([1024, 1024, 3], dtype='float32')
scale = [512, 1024]
h, w = img.shape[:2]
max_long_edge = max(scale)
max_short_edge = min(scale)
scale_factor = min(max_long_edge / max(h, w),
max_short_edge / min(h, w))
resize_w, resize_h = int(w * float(scale_factor) + 0.5), int(h * float(
scale_factor) + 0.5)
max_stride = 32
resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
im = cv2.resize(img, (resize_w, resize_h))
new_h, new_w = im.shape[:2]
w_scale = new_w / w
h_scale = new_h / h
scale_factor = np.array(
[w_scale, h_scale, w_scale, h_scale], dtype=np.float32)
norm_img[:new_h, :new_w, :] = im
return norm_img, scale_factor, [h_scale, w_scale], [new_h, new_w]
def resize_boxes(self, im, points, scale_factor):
points = points * scale_factor
img_shape = im.shape[:2]
points[:, 0::2] = np.clip(points[:, 0::2], 0, img_shape[1])
points[:, 1::2] = np.clip(points[:, 1::2], 0, img_shape[0])
return points
class SRResize:
def __init__(self,
imgH=32,
imgW=128,
down_sample_scale=4,
keep_ratio=False,
min_ratio=1,
mask=False,
infer_mode=False,
**kwargs):
self.imgH = imgH
self.imgW = imgW
self.keep_ratio = keep_ratio
self.min_ratio = min_ratio
self.down_sample_scale = down_sample_scale
self.mask = mask
self.infer_mode = infer_mode
def __call__(self, data):
imgH = self.imgH
imgW = self.imgW
images_lr = data["image_lr"]
transform2 = ResizeNormalize(
(imgW // self.down_sample_scale, imgH // self.down_sample_scale))
images_lr = transform2(images_lr)
data["img_lr"] = images_lr
if self.infer_mode:
return data
images_HR = data["image_hr"]
_label_strs = data["label"]
transform = ResizeNormalize((imgW, imgH))
images_HR = transform(images_HR)
data["img_hr"] = images_HR
return data
class ResizeNormalize:
def __init__(self, size, interpolation=Image.BICUBIC):
self.size = size
self.interpolation = interpolation
def __call__(self, img):
img = img.resize(self.size, self.interpolation)
img_numpy = np.array(img).astype("float32")
img_numpy = img_numpy.transpose((2, 0, 1)) / 255
return img_numpy
class GrayImageChannelFormat:
"""
format gray scale image's channel: (3,h,w) -> (1,h,w)
Args:
inverse: inverse gray image
"""
def __init__(self, inverse=False, **kwargs):
self.inverse = inverse
def __call__(self, data):
img = data['image']
img_single_channel = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_expanded = np.expand_dims(img_single_channel, 0)
if self.inverse:
data['image'] = np.abs(img_expanded - 1)
else:
data['image'] = img_expanded
data['src_image'] = img
return data
class Permute:
"""permute image
Args:
to_bgr (bool): whether convert RGB to BGR
channel_first (bool): whether convert HWC to CHW
"""
def __init__(self, ):
super(Permute, self).__init__()
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
im = im.transpose((2, 0, 1)).copy()
return im, im_info
class PadStride:
""" padding image for model with FPN, instead PadBatch(pad_to_stride) in original config
Args:
stride (bool): model with FPN need image shape % stride == 0
"""
def __init__(self, stride=0):
self.coarsest_stride = stride
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
coarsest_stride = self.coarsest_stride
if coarsest_stride <= 0:
return im, im_info
im_c, im_h, im_w = im.shape
pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
padding_im[:, :im_h, :im_w] = im
return padding_im, im_info
def decode_image(im_file, im_info):
"""read rgb image
Args:
im_file (str|np.ndarray): input can be image path or np.ndarray
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
if isinstance(im_file, str):
with open(im_file, 'rb') as f:
im_read = f.read()
data = np.frombuffer(im_read, dtype='uint8')
im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
else:
im = im_file
im_info['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
im_info['scale_factor'] = np.array([1., 1.], dtype=np.float32)
return im, im_info
def preprocess(im, preprocess_ops):
# process image by preprocess_ops
im_info = {
'scale_factor': np.array(
[1., 1.], dtype=np.float32),
'im_shape': None,
}
im, im_info = decode_image(im, im_info)
for operator in preprocess_ops:
im, im_info = operator(im, im_info)
return im, im_info
def nms(bboxes, scores, iou_thresh):
import numpy as np
x1 = bboxes[:, 0]
y1 = bboxes[:, 1]
x2 = bboxes[:, 2]
y2 = bboxes[:, 3]
areas = (y2 - y1) * (x2 - x1)
indices = []
index = scores.argsort()[::-1]
while index.size > 0:
i = index[0]
indices.append(i)
x11 = np.maximum(x1[i], x1[index[1:]])
y11 = np.maximum(y1[i], y1[index[1:]])
x22 = np.minimum(x2[i], x2[index[1:]])
y22 = np.minimum(y2[i], y2[index[1:]])
w = np.maximum(0, x22 - x11 + 1)
h = np.maximum(0, y22 - y11 + 1)
overlaps = w * h
ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
idx = np.where(ious <= iou_thresh)[0]
index = index[idx + 1]
return indices
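# Editor's sketch (not in the original source): nms keeps the highest-scoring box and drops
# any box whose IoU with it exceeds the threshold; the coordinates below are assumed.
if __name__ == "__main__":
    boxes = np.array([[0, 0, 10, 10], [1, 1, 10, 10], [20, 20, 30, 30]], dtype=np.float32)
    scores = np.array([0.9, 0.8, 0.7], dtype=np.float32)
    print(nms(boxes, scores, iou_thresh=0.5))  # keeps boxes 0 and 2; the near-duplicate box 1 is suppressed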

View File

@ -1,370 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
import numpy as np
import cv2
from shapely.geometry import Polygon
import pyclipper
def build_post_process(config, global_config=None):
support_dict = {'DBPostProcess': DBPostProcess, 'CTCLabelDecode': CTCLabelDecode}
config = copy.deepcopy(config)
module_name = config.pop('name')
if module_name == "None":
return
if global_config is not None:
config.update(global_config)
module_class = support_dict.get(module_name)
if module_class is None:
raise ValueError(
'post process only supports {}'.format(list(support_dict)))
return module_class(**config)
class DBPostProcess:
"""
The post process for Differentiable Binarization (DB).
"""
def __init__(self,
thresh=0.3,
box_thresh=0.7,
max_candidates=1000,
unclip_ratio=2.0,
use_dilation=False,
score_mode="fast",
box_type='quad',
**kwargs):
self.thresh = thresh
self.box_thresh = box_thresh
self.max_candidates = max_candidates
self.unclip_ratio = unclip_ratio
self.min_size = 3
self.score_mode = score_mode
self.box_type = box_type
assert score_mode in [
"slow", "fast"
], "Score mode must be in [slow, fast] but got: {}".format(score_mode)
self.dilation_kernel = None if not use_dilation else np.array(
[[1, 1], [1, 1]])
def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
'''
_bitmap: single map with shape (1, H, W),
whose values are binarized as {0, 1}
'''
bitmap = _bitmap
height, width = bitmap.shape
boxes = []
scores = []
contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8),
cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
for contour in contours[:self.max_candidates]:
epsilon = 0.002 * cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, epsilon, True)
points = approx.reshape((-1, 2))
if points.shape[0] < 4:
continue
score = self.box_score_fast(pred, points.reshape(-1, 2))
if self.box_thresh > score:
continue
if points.shape[0] > 2:
box = self.unclip(points, self.unclip_ratio)
if len(box) > 1:
continue
else:
continue
box = box.reshape(-1, 2)
_, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
if sside < self.min_size + 2:
continue
box = np.array(box)
box[:, 0] = np.clip(
np.round(box[:, 0] / width * dest_width), 0, dest_width)
box[:, 1] = np.clip(
np.round(box[:, 1] / height * dest_height), 0, dest_height)
boxes.append(box.tolist())
scores.append(score)
return boxes, scores
def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
'''
_bitmap: single map with shape (1, H, W),
whose values are binarized as {0, 1}
'''
bitmap = _bitmap
height, width = bitmap.shape
outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST,
cv2.CHAIN_APPROX_SIMPLE)
if len(outs) == 3:
_img, contours, _ = outs[0], outs[1], outs[2]
elif len(outs) == 2:
contours, _ = outs[0], outs[1]
num_contours = min(len(contours), self.max_candidates)
boxes = []
scores = []
for index in range(num_contours):
contour = contours[index]
points, sside = self.get_mini_boxes(contour)
if sside < self.min_size:
continue
points = np.array(points)
if self.score_mode == "fast":
score = self.box_score_fast(pred, points.reshape(-1, 2))
else:
score = self.box_score_slow(pred, contour)
if self.box_thresh > score:
continue
box = self.unclip(points, self.unclip_ratio).reshape(-1, 1, 2)
box, sside = self.get_mini_boxes(box)
if sside < self.min_size + 2:
continue
box = np.array(box)
box[:, 0] = np.clip(
np.round(box[:, 0] / width * dest_width), 0, dest_width)
box[:, 1] = np.clip(
np.round(box[:, 1] / height * dest_height), 0, dest_height)
boxes.append(box.astype("int32"))
scores.append(score)
return np.array(boxes, dtype="int32"), scores
def unclip(self, box, unclip_ratio):
poly = Polygon(box)
distance = poly.area * unclip_ratio / poly.length
offset = pyclipper.PyclipperOffset()
offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
expanded = np.array(offset.Execute(distance))
return expanded
def get_mini_boxes(self, contour):
bounding_box = cv2.minAreaRect(contour)
points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])
index_1, index_2, index_3, index_4 = 0, 1, 2, 3
if points[1][1] > points[0][1]:
index_1 = 0
index_4 = 1
else:
index_1 = 1
index_4 = 0
if points[3][1] > points[2][1]:
index_2 = 2
index_3 = 3
else:
index_2 = 3
index_3 = 2
box = [
points[index_1], points[index_2], points[index_3], points[index_4]
]
return box, min(bounding_box[1])
def box_score_fast(self, bitmap, _box):
'''
box_score_fast: use the mean score inside the bounding box as the box score
'''
h, w = bitmap.shape[:2]
box = _box.copy()
xmin = np.clip(np.floor(box[:, 0].min()).astype("int32"), 0, w - 1)
xmax = np.clip(np.ceil(box[:, 0].max()).astype("int32"), 0, w - 1)
ymin = np.clip(np.floor(box[:, 1].min()).astype("int32"), 0, h - 1)
ymax = np.clip(np.ceil(box[:, 1].max()).astype("int32"), 0, h - 1)
mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
box[:, 0] = box[:, 0] - xmin
box[:, 1] = box[:, 1] - ymin
cv2.fillPoly(mask, box.reshape(1, -1, 2).astype("int32"), 1)
return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
def box_score_slow(self, bitmap, contour):
'''
box_score_slow: use the mean score inside the polygon as the box score
'''
h, w = bitmap.shape[:2]
contour = contour.copy()
contour = np.reshape(contour, (-1, 2))
xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)
mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
contour[:, 0] = contour[:, 0] - xmin
contour[:, 1] = contour[:, 1] - ymin
cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1)
return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
def __call__(self, outs_dict, shape_list):
pred = outs_dict['maps']
if not isinstance(pred, np.ndarray):
pred = pred.numpy()
pred = pred[:, 0, :, :]
segmentation = pred > self.thresh
boxes_batch = []
for batch_index in range(pred.shape[0]):
src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
if self.dilation_kernel is not None:
mask = cv2.dilate(
np.array(segmentation[batch_index]).astype(np.uint8),
self.dilation_kernel)
else:
mask = segmentation[batch_index]
if self.box_type == 'poly':
boxes, scores = self.polygons_from_bitmap(pred[batch_index],
mask, src_w, src_h)
elif self.box_type == 'quad':
boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,
src_w, src_h)
else:
raise ValueError(
"box_type can only be one of ['quad', 'poly']")
boxes_batch.append({'points': boxes})
return boxes_batch
class BaseRecLabelDecode:
""" Convert between text-label and text-index """
def __init__(self, character_dict_path=None, use_space_char=False):
self.beg_str = "sos"
self.end_str = "eos"
self.reverse = False
self.character_str = []
if character_dict_path is None:
self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
dict_character = list(self.character_str)
else:
with open(character_dict_path, "rb") as fin:
lines = fin.readlines()
for line in lines:
line = line.decode('utf-8').strip("\n").strip("\r\n")
self.character_str.append(line)
if use_space_char:
self.character_str.append(" ")
dict_character = list(self.character_str)
if 'arabic' in character_dict_path:
self.reverse = True
dict_character = self.add_special_char(dict_character)
self.dict = {}
for i, char in enumerate(dict_character):
self.dict[char] = i
self.character = dict_character
def pred_reverse(self, pred):
pred_re = []
c_current = ''
for c in pred:
if not bool(re.search('[a-zA-Z0-9 :*./%+-]', c)):
if c_current != '':
pred_re.append(c_current)
pred_re.append(c)
c_current = ''
else:
c_current += c
if c_current != '':
pred_re.append(c_current)
return ''.join(pred_re[::-1])
def add_special_char(self, dict_character):
return dict_character
def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
""" convert text-index into text-label. """
result_list = []
ignored_tokens = self.get_ignored_tokens()
batch_size = len(text_index)
for batch_idx in range(batch_size):
selection = np.ones(len(text_index[batch_idx]), dtype=bool)
if is_remove_duplicate:
selection[1:] = text_index[batch_idx][1:] != text_index[
batch_idx][:-1]
for ignored_token in ignored_tokens:
selection &= text_index[batch_idx] != ignored_token
char_list = [
self.character[text_id]
for text_id in text_index[batch_idx][selection]
]
if text_prob is not None:
conf_list = text_prob[batch_idx][selection]
else:
conf_list = [1] * len(selection)
if len(conf_list) == 0:
conf_list = [0]
text = ''.join(char_list)
if self.reverse: # for arabic rec
text = self.pred_reverse(text)
result_list.append((text, np.mean(conf_list).tolist()))
return result_list
def get_ignored_tokens(self):
return [0] # for ctc blank
class CTCLabelDecode(BaseRecLabelDecode):
""" Convert between text-label and text-index """
def __init__(self, character_dict_path=None, use_space_char=False,
**kwargs):
super(CTCLabelDecode, self).__init__(character_dict_path,
use_space_char)
def __call__(self, preds, label=None, *args, **kwargs):
if isinstance(preds, tuple) or isinstance(preds, list):
preds = preds[-1]
if not isinstance(preds, np.ndarray):
preds = preds.numpy()
preds_idx = preds.argmax(axis=2)
preds_prob = preds.max(axis=2)
text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True)
if label is None:
return text
label = self.decode(label)
return text, label
def add_special_char(self, dict_character):
dict_character = ['blank'] + dict_character
return dict_character
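Aside: a minimal, self-contained sketch of the duplicate-collapse and blank-removal step performed by the decode() method of CTCLabelDecode above; the toy vocabulary and per-timestep predictions are illustrative assumptions.
import numpy as np
characters = ['blank', 'c', 'a', 't']      # index 0 is the CTC blank
pred = np.array([1, 1, 0, 2, 2, 0, 3])     # per-timestep argmax: "cc_aa_t"
selection = np.ones(len(pred), dtype=bool)
selection[1:] = pred[1:] != pred[:-1]      # drop repeated indices
selection &= pred != 0                     # drop blanks
print(''.join(characters[i] for i in pred[selection]))   # -> "cat"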

View File

@ -1,435 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import math
import numpy as np
import cv2
from functools import cmp_to_key
from api.utils.file_utils import get_project_base_directory
from .operators import * # noqa: F403
from .operators import preprocess
from . import operators
from .ocr import load_model
class Recognizer:
def __init__(self, label_list, task_name, model_dir=None):
"""
If you have trouble downloading HuggingFace models, -_^ this might help!!
For Linux:
export HF_ENDPOINT=https://hf-mirror.com
For Windows:
Good luck
^_-
"""
if not model_dir:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
self.ort_sess, self.run_options = load_model(model_dir, task_name)
self.input_names = [node.name for node in self.ort_sess.get_inputs()]
self.output_names = [node.name for node in self.ort_sess.get_outputs()]
self.input_shape = self.ort_sess.get_inputs()[0].shape[2:4]
self.label_list = label_list
@staticmethod
def sort_Y_firstly(arr, threashold):
def cmp(c1, c2):
diff = c1["top"] - c2["top"]
if abs(diff) < threashold:
diff = c1["x0"] - c2["x0"]
return diff
arr = sorted(arr, key=cmp_to_key(cmp))
return arr
@staticmethod
def sort_X_firstly(arr, threashold):
def cmp(c1, c2):
diff = c1["x0"] - c2["x0"]
if abs(diff) < threashold:
diff = c1["top"] - c2["top"]
return diff
arr = sorted(arr, key=cmp_to_key(cmp))
return arr
@staticmethod
def sort_C_firstly(arr, thr=0):
# sort by x0 first, then refine the order using the column index C and top
# sorted(arr, key=lambda r: (r["x0"], r["top"]))
arr = Recognizer.sort_X_firstly(arr, thr)
for i in range(len(arr) - 1):
for j in range(i, -1, -1):
# restore the order using the column index C
if "C" not in arr[j] or "C" not in arr[j + 1]:
continue
if arr[j + 1]["C"] < arr[j]["C"] \
or (
arr[j + 1]["C"] == arr[j]["C"]
and arr[j + 1]["top"] < arr[j]["top"]
):
tmp = arr[j]
arr[j] = arr[j + 1]
arr[j + 1] = tmp
return arr
@staticmethod
def sort_R_firstly(arr, thr=0):
# sort by top first, then refine the order using the row index R and x0
# sorted(arr, key=lambda r: (r["top"], r["x0"]))
arr = Recognizer.sort_Y_firstly(arr, thr)
for i in range(len(arr) - 1):
for j in range(i, -1, -1):
if "R" not in arr[j] or "R" not in arr[j + 1]:
continue
if arr[j + 1]["R"] < arr[j]["R"] \
or (
arr[j + 1]["R"] == arr[j]["R"]
and arr[j + 1]["x0"] < arr[j]["x0"]
):
tmp = arr[j]
arr[j] = arr[j + 1]
arr[j + 1] = tmp
return arr
@staticmethod
def overlapped_area(a, b, ratio=True):
tp, btm, x0, x1 = a["top"], a["bottom"], a["x0"], a["x1"]
if b["x0"] > x1 or b["x1"] < x0:
return 0
if b["bottom"] < tp or b["top"] > btm:
return 0
x0_ = max(b["x0"], x0)
x1_ = min(b["x1"], x1)
assert x0_ <= x1_, "Bbox mismatch! T:{},B:{},X0:{},X1:{} ==> {}".format(
tp, btm, x0, x1, b)
tp_ = max(b["top"], tp)
btm_ = min(b["bottom"], btm)
assert tp_ <= btm_, "Bbox mismatch! T:{},B:{},X0:{},X1:{} => {}".format(
tp, btm, x0, x1, b)
ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
x0 != 0 and btm - tp != 0 else 0
if ov > 0 and ratio:
ov /= (x1 - x0) * (btm - tp)
return ov
@staticmethod
def layouts_cleanup(boxes, layouts, far=2, thr=0.7):
def notOverlapped(a, b):
return any([a["x1"] < b["x0"],
a["x0"] > b["x1"],
a["bottom"] < b["top"],
a["top"] > b["bottom"]])
i = 0
while i + 1 < len(layouts):
j = i + 1
while j < min(i + far, len(layouts)) \
and (layouts[i].get("type", "") != layouts[j].get("type", "")
or notOverlapped(layouts[i], layouts[j])):
j += 1
if j >= min(i + far, len(layouts)):
i += 1
continue
if Recognizer.overlapped_area(layouts[i], layouts[j]) < thr \
and Recognizer.overlapped_area(layouts[j], layouts[i]) < thr:
i += 1
continue
if layouts[i].get("score") and layouts[j].get("score"):
if layouts[i]["score"] > layouts[j]["score"]:
layouts.pop(j)
else:
layouts.pop(i)
continue
area_i, area_i_1 = 0, 0
for b in boxes:
if not notOverlapped(b, layouts[i]):
area_i += Recognizer.overlapped_area(b, layouts[i], False)
if not notOverlapped(b, layouts[j]):
area_i_1 += Recognizer.overlapped_area(b, layouts[j], False)
if area_i > area_i_1:
layouts.pop(j)
else:
layouts.pop(i)
return layouts
def create_inputs(self, imgs, im_info):
"""generate inputs for different model types
Args:
imgs (list(numpy)): list of images (np.ndarray)
im_info (list(dict)): list of image info
Returns:
inputs (dict): input of model
"""
inputs = {}
im_shape = []
scale_factor = []
if len(imgs) == 1:
inputs['image'] = np.array((imgs[0],)).astype('float32')
inputs['im_shape'] = np.array(
(im_info[0]['im_shape'],)).astype('float32')
inputs['scale_factor'] = np.array(
(im_info[0]['scale_factor'],)).astype('float32')
return inputs
for e in im_info:
im_shape.append(np.array((e['im_shape'],)).astype('float32'))
scale_factor.append(np.array((e['scale_factor'],)).astype('float32'))
inputs['im_shape'] = np.concatenate(im_shape, axis=0)
inputs['scale_factor'] = np.concatenate(scale_factor, axis=0)
imgs_shape = [[e.shape[1], e.shape[2]] for e in imgs]
max_shape_h = max([e[0] for e in imgs_shape])
max_shape_w = max([e[1] for e in imgs_shape])
padding_imgs = []
for img in imgs:
im_c, im_h, im_w = img.shape[:]
padding_im = np.zeros(
(im_c, max_shape_h, max_shape_w), dtype=np.float32)
padding_im[:, :im_h, :im_w] = img
padding_imgs.append(padding_im)
inputs['image'] = np.stack(padding_imgs, axis=0)
return inputs
@staticmethod
def find_overlapped(box, boxes_sorted_by_y, naive=False):
if not boxes_sorted_by_y:
return
bxs = boxes_sorted_by_y
s, e, ii = 0, len(bxs), 0
while s < e and not naive:
ii = (e + s) // 2
pv = bxs[ii]
if box["bottom"] < pv["top"]:
e = ii
continue
if box["top"] > pv["bottom"]:
s = ii + 1
continue
break
while s < ii:
if box["top"] > bxs[s]["bottom"]:
s += 1
break
while e - 1 > ii:
if box["bottom"] < bxs[e - 1]["top"]:
e -= 1
break
max_overlaped_i, max_overlaped = None, 0
for i in range(s, e):
ov = Recognizer.overlapped_area(bxs[i], box)
if ov <= max_overlaped:
continue
max_overlaped_i = i
max_overlaped = ov
return max_overlaped_i
@staticmethod
def find_horizontally_tightest_fit(box, boxes):
if not boxes:
return
min_dis, min_i = 1000000, None
for i,b in enumerate(boxes):
if box.get("layoutno", "0") != b.get("layoutno", "0"):
continue
dis = min(abs(box["x0"] - b["x0"]), abs(box["x1"] - b["x1"]), abs(box["x0"]+box["x1"] - b["x1"] - b["x0"])/2)
if dis < min_dis:
min_i = i
min_dis = dis
return min_i
@staticmethod
def find_overlapped_with_threashold(box, boxes, thr=0.3):
if not boxes:
return
max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0
s, e = 0, len(boxes)
for i in range(s, e):
ov = Recognizer.overlapped_area(box, boxes[i])
_ov = Recognizer.overlapped_area(boxes[i], box)
if (ov, _ov) < (max_overlapped, _max_overlapped):
continue
max_overlapped_i = i
max_overlapped = ov
_max_overlapped = _ov
return max_overlapped_i
def preprocess(self, image_list):
inputs = []
if "scale_factor" in self.input_names:
preprocess_ops = []
for op_info in [
{'interp': 2, 'keep_ratio': False, 'target_size': [800, 608], 'type': 'LinearResize'},
{'is_scale': True, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225], 'type': 'StandardizeImage'},
{'type': 'Permute'},
{'stride': 32, 'type': 'PadStride'}
]:
new_op_info = op_info.copy()
op_type = new_op_info.pop('type')
preprocess_ops.append(getattr(operators, op_type)(**new_op_info))
for im_path in image_list:
im, im_info = preprocess(im_path, preprocess_ops)
inputs.append({"image": np.array((im,)).astype('float32'),
"scale_factor": np.array((im_info["scale_factor"],)).astype('float32')})
else:
hh, ww = self.input_shape
for img in image_list:
h, w = img.shape[:2]
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = cv2.resize(np.array(img).astype('float32'), (ww, hh))
# Scale input pixel values to 0 to 1
img /= 255.0
img = img.transpose(2, 0, 1)
img = img[np.newaxis, :, :, :].astype(np.float32)
inputs.append({self.input_names[0]: img, "scale_factor": [w/ww, h/hh]})
return inputs
def postprocess(self, boxes, inputs, thr):
if "scale_factor" in self.input_names:
bb = []
for b in boxes:
clsid, bbox, score = int(b[0]), b[2:], b[1]
if score < thr:
continue
if clsid >= len(self.label_list):
continue
bb.append({
"type": self.label_list[clsid].lower(),
"bbox": [float(t) for t in bbox.tolist()],
"score": float(score)
})
return bb
def xywh2xyxy(x):
# [x, y, w, h] to [x1, y1, x2, y2]
y = np.copy(x)
y[:, 0] = x[:, 0] - x[:, 2] / 2
y[:, 1] = x[:, 1] - x[:, 3] / 2
y[:, 2] = x[:, 0] + x[:, 2] / 2
y[:, 3] = x[:, 1] + x[:, 3] / 2
return y
def compute_iou(box, boxes):
# Compute xmin, ymin, xmax, ymax for both boxes
xmin = np.maximum(box[0], boxes[:, 0])
ymin = np.maximum(box[1], boxes[:, 1])
xmax = np.minimum(box[2], boxes[:, 2])
ymax = np.minimum(box[3], boxes[:, 3])
# Compute intersection area
intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
# Compute union area
box_area = (box[2] - box[0]) * (box[3] - box[1])
boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
union_area = box_area + boxes_area - intersection_area
# Compute IoU
iou = intersection_area / union_area
return iou
def iou_filter(boxes, scores, iou_threshold):
sorted_indices = np.argsort(scores)[::-1]
keep_boxes = []
while sorted_indices.size > 0:
# Pick the highest-scoring remaining box
box_id = sorted_indices[0]
keep_boxes.append(box_id)
# Compute IoU of the picked box with the rest
ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :])
# Remove boxes with IoU over the threshold
keep_indices = np.where(ious < iou_threshold)[0]
# print(keep_indices.shape, sorted_indices.shape)
sorted_indices = sorted_indices[keep_indices + 1]
return keep_boxes
boxes = np.squeeze(boxes).T
# Filter out object confidence scores below threshold
scores = np.max(boxes[:, 4:], axis=1)
boxes = boxes[scores > thr, :]
scores = scores[scores > thr]
if len(boxes) == 0:
return []
# Get the class with the highest confidence
class_ids = np.argmax(boxes[:, 4:], axis=1)
boxes = boxes[:, :4]
input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0], inputs["scale_factor"][1]])
boxes = np.multiply(boxes, input_shape, dtype=np.float32)
boxes = xywh2xyxy(boxes)
unique_class_ids = np.unique(class_ids)
indices = []
for class_id in unique_class_ids:
class_indices = np.where(class_ids == class_id)[0]
class_boxes = boxes[class_indices, :]
class_scores = scores[class_indices]
class_keep_boxes = iou_filter(class_boxes, class_scores, 0.2)
indices.extend(class_indices[class_keep_boxes])
return [{
"type": self.label_list[class_ids[i]].lower(),
"bbox": [float(t) for t in boxes[i].tolist()],
"score": float(scores[i])
} for i in indices]
def __call__(self, image_list, thr=0.7, batch_size=16):
res = []
imgs = []
for i in range(len(image_list)):
if not isinstance(image_list[i], np.ndarray):
imgs.append(np.array(image_list[i]))
else:
imgs.append(image_list[i])
batch_loop_cnt = math.ceil(float(len(imgs)) / batch_size)
for i in range(batch_loop_cnt):
start_index = i * batch_size
end_index = min((i + 1) * batch_size, len(imgs))
batch_image_list = imgs[start_index:end_index]
inputs = self.preprocess(batch_image_list)
logging.debug("preprocess")
for ins in inputs:
bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names}, self.run_options)[0], ins, thr)
res.append(bb)
#seeit.save_results(image_list, res, self.label_list, threshold=thr)
return res
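Aside: a minimal, self-contained NumPy sketch of the center-format conversion and IoU arithmetic used by xywh2xyxy() and compute_iou() inside postprocess() above; the coordinates are illustrative assumptions.
import numpy as np
xywh = np.array([[50., 50., 20., 10.]])    # [cx, cy, w, h]
xyxy = np.copy(xywh)
xyxy[:, 0] = xywh[:, 0] - xywh[:, 2] / 2   # x1 = 40
xyxy[:, 1] = xywh[:, 1] - xywh[:, 3] / 2   # y1 = 45
xyxy[:, 2] = xywh[:, 0] + xywh[:, 2] / 2   # x2 = 60
xyxy[:, 3] = xywh[:, 1] + xywh[:, 3] / 2   # y2 = 55
box = xyxy[0]
other = np.array([[50., 45., 70., 55.]])   # same size, shifted right by 10 pixels
inter = np.maximum(0, np.minimum(box[2], other[:, 2]) - np.maximum(box[0], other[:, 0])) * \
    np.maximum(0, np.minimum(box[3], other[:, 3]) - np.maximum(box[1], other[:, 1]))
union = (box[2] - box[0]) * (box[3] - box[1]) + \
    (other[:, 2] - other[:, 0]) * (other[:, 3] - other[:, 1]) - inter
print(inter / union)                       # [0.333...]: IoU of 100 / 300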

View File

@ -1,87 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import PIL
from PIL import ImageDraw
def save_results(image_list, results, labels, output_dir='output/', threshold=0.5):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for idx, im in enumerate(image_list):
im = draw_box(im, results[idx], labels, threshold=threshold)
out_path = os.path.join(output_dir, f"{idx}.jpg")
im.save(out_path, quality=95)
logging.debug("save result to: " + out_path)
def draw_box(im, result, lables, threshold=0.5):
draw_thickness = min(im.size) // 320
draw = ImageDraw.Draw(im)
color_list = get_color_map_list(len(lables))
clsid2color = {n.lower():color_list[i] for i,n in enumerate(lables)}
result = [r for r in result if r["score"] >= threshold]
for dt in result:
color = tuple(clsid2color[dt["type"]])
xmin, ymin, xmax, ymax = dt["bbox"]
draw.line(
[(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
(xmin, ymin)],
width=draw_thickness,
fill=color)
# draw label
text = "{} {:.4f}".format(dt["type"], dt["score"])
tw, th = imagedraw_textsize_c(draw, text)
draw.rectangle(
[(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color)
draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
return im
def get_color_map_list(num_classes):
"""
Args:
num_classes (int): number of classes
Returns:
color_map (list): RGB color list
"""
color_map = num_classes * [0, 0, 0]
for i in range(0, num_classes):
j = 0
lab = i
while lab:
color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j))
color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j))
color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j))
j += 1
lab >>= 3
color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)]
return color_map
def imagedraw_textsize_c(draw, text):
if int(PIL.__version__.split('.')[0]) < 10:
tw, th = draw.textsize(text)
else:
left, top, right, bottom = draw.textbbox((0, 0), text)
tw, th = right - left, bottom - top
return tw, th
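Aside: a minimal sketch of the Pillow version shim behind imagedraw_textsize_c() above; ImageDraw.textsize() was removed in Pillow 10, so textbbox() is the portable path (assumes Pillow is installed).
import PIL
from PIL import Image, ImageDraw
im = Image.new("RGB", (200, 50), "white")
draw = ImageDraw.Draw(im)
text = "table 0.9876"
if int(PIL.__version__.split('.')[0]) < 10:
    tw, th = draw.textsize(text)                       # Pillow < 10
else:
    left, top, right, bottom = draw.textbbox((0, 0), text)
    tw, th = right - left, bottom - top                # Pillow >= 10
print(tw, th)                                          # text width and height in pixels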

View File

@ -1,59 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import sys
sys.path.insert(
0,
os.path.abspath(
os.path.join(
os.path.dirname(
os.path.abspath(__file__)),
'../../')))
from deepdoc.vision.seeit import draw_box
from deepdoc.vision import OCR, init_in_out
import argparse
import numpy as np
def main(args):
ocr = OCR()
images, outputs = init_in_out(args)
for i, img in enumerate(images):
bxs = ocr(np.array(img))
bxs = [(line[0], line[1][0]) for line in bxs]
bxs = [{
"text": t,
"bbox": [b[0][0], b[0][1], b[1][0], b[-1][1]],
"type": "ocr",
"score": 1} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]]
img = draw_box(images[i], bxs, ["ocr"], 1.)
img.save(outputs[i], quality=95)
with open(outputs[i] + ".txt", "w+", encoding='utf-8') as f:
f.write("\n".join([o["text"] for o in bxs]))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--inputs',
help="Directory containing input images or PDFs, or the path to a single image or PDF",
required=True)
parser.add_argument('--output_dir', help="Directory where to store the output images. Default: './ocr_outputs'",
default="./ocr_outputs")
args = parser.parse_args()
main(args)

View File

@ -1,186 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import sys
sys.path.insert(
0,
os.path.abspath(
os.path.join(
os.path.dirname(
os.path.abspath(__file__)),
'../../')))
from deepdoc.vision.seeit import draw_box
from deepdoc.vision import LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
import argparse
import re
import numpy as np
def main(args):
images, outputs = init_in_out(args)
if args.mode.lower() == "layout":
detr = LayoutRecognizer("layout")
layouts = detr.forward(images, thr=float(args.threshold))
if args.mode.lower() == "tsr":
detr = TableStructureRecognizer()
ocr = OCR()
layouts = detr(images, thr=float(args.threshold))
for i, lyt in enumerate(layouts):
if args.mode.lower() == "tsr":
#lyt = [t for t in lyt if t["type"] == "table column"]
html = get_table_html(images[i], lyt, ocr)
with open(outputs[i] + ".html", "w+", encoding='utf-8') as f:
f.write(html)
lyt = [{
"type": t["label"],
"bbox": [t["x0"], t["top"], t["x1"], t["bottom"]],
"score": t["score"]
} for t in lyt]
img = draw_box(images[i], lyt, detr.labels, float(args.threshold))
img.save(outputs[i], quality=95)
logging.info("save result to: " + outputs[i])
def get_table_html(img, tb_cpns, ocr):
boxes = ocr(np.array(img))
boxes = LayoutRecognizer.sort_Y_firstly(
[{"x0": b[0][0], "x1": b[1][0],
"top": b[0][1], "text": t[0],
"bottom": b[-1][1],
"layout_type": "table",
"page_number": 0} for b, t in boxes if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
np.mean([b[-1][1] - b[0][1] for b, _ in boxes]) / 3
)
def gather(kwd, fzy=10, ption=0.6):
nonlocal boxes
eles = LayoutRecognizer.sort_Y_firstly(
[r for r in tb_cpns if re.match(kwd, r["label"])], fzy)
eles = LayoutRecognizer.layouts_cleanup(boxes, eles, 5, ption)
return LayoutRecognizer.sort_Y_firstly(eles, 0)
headers = gather(r".*header$")
rows = gather(r".* (row|header)")
spans = gather(r".*spanning")
clmns = sorted([r for r in tb_cpns if re.match(
r"table column$", r["label"])], key=lambda x: x["x0"])
clmns = LayoutRecognizer.layouts_cleanup(boxes, clmns, 5, 0.5)
for b in boxes:
ii = LayoutRecognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
if ii is not None:
b["R"] = ii
b["R_top"] = rows[ii]["top"]
b["R_bott"] = rows[ii]["bottom"]
ii = LayoutRecognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
if ii is not None:
b["H_top"] = headers[ii]["top"]
b["H_bott"] = headers[ii]["bottom"]
b["H_left"] = headers[ii]["x0"]
b["H_right"] = headers[ii]["x1"]
b["H"] = ii
ii = LayoutRecognizer.find_horizontally_tightest_fit(b, clmns)
if ii is not None:
b["C"] = ii
b["C_left"] = clmns[ii]["x0"]
b["C_right"] = clmns[ii]["x1"]
ii = LayoutRecognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
if ii is not None:
b["H_top"] = spans[ii]["top"]
b["H_bott"] = spans[ii]["bottom"]
b["H_left"] = spans[ii]["x0"]
b["H_right"] = spans[ii]["x1"]
b["SP"] = ii
html = """
<html>
<head>
<style>
._table_1nkzy_11 {
margin: auto;
width: 70%%;
padding: 10px;
}
._table_1nkzy_11 p {
margin-bottom: 50px;
border: 1px solid #e1e1e1;
}
caption {
color: #6ac1ca;
font-size: 20px;
height: 50px;
line-height: 50px;
font-weight: 600;
margin-bottom: 10px;
}
._table_1nkzy_11 table {
width: 100%%;
border-collapse: collapse;
}
th {
color: #fff;
background-color: #6ac1ca;
}
td:hover {
background: #c1e8e8;
}
tr:nth-child(even) {
background-color: #f2f2f2;
}
._table_1nkzy_11 th,
._table_1nkzy_11 td {
text-align: center;
border: 1px solid #ddd;
padding: 8px;
}
</style>
</head>
<body>
%s
</body>
</html>
""" % TableStructureRecognizer.construct_table(boxes, html=True)
return html
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--inputs',
help="Directory containing input images or PDFs, or the path to a single image or PDF",
required=True)
parser.add_argument('--output_dir', help="Directory where to store the output images. Default: './layouts_outputs'",
default="./layouts_outputs")
parser.add_argument(
'--threshold',
help="A threshold to filter out detections. Default: 0.5",
default=0.5)
parser.add_argument('--mode', help="Task mode: layout recognition or table structure recognition", choices=["layout", "tsr"],
default="layout")
args = parser.parse_args()
main(args)

View File

@ -1,587 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import re
from collections import Counter
import numpy as np
from huggingface_hub import snapshot_download
from api.utils.file_utils import get_project_base_directory
from rag.nlp import rag_tokenizer
from .recognizer import Recognizer
class TableStructureRecognizer(Recognizer):
labels = [
"table",
"table column",
"table row",
"table column header",
"table projected row header",
"table spanning cell",
]
def __init__(self):
try:
super().__init__(self.labels, "tsr", os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"))
except Exception:
super().__init__(self.labels, "tsr", snapshot_download(repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
local_dir_use_symlinks=False))
def __call__(self, images, thr=0.2):
tbls = super().__call__(images, thr)
res = []
# align left&right for rows, align top&bottom for columns
for tbl in tbls:
lts = [{"label": b["type"],
"score": b["score"],
"x0": b["bbox"][0], "x1": b["bbox"][2],
"top": b["bbox"][1], "bottom": b["bbox"][-1]
} for b in tbl]
if not lts:
continue
left = [b["x0"] for b in lts if b["label"].find(
"row") > 0 or b["label"].find("header") > 0]
right = [b["x1"] for b in lts if b["label"].find(
"row") > 0 or b["label"].find("header") > 0]
if not left:
continue
left = np.mean(left) if len(left) > 4 else np.min(left)
right = np.mean(right) if len(right) > 4 else np.max(right)
for b in lts:
if b["label"].find("row") > 0 or b["label"].find("header") > 0:
if b["x0"] > left:
b["x0"] = left
if b["x1"] < right:
b["x1"] = right
top = [b["top"] for b in lts if b["label"] == "table column"]
bottom = [b["bottom"] for b in lts if b["label"] == "table column"]
if not top:
res.append(lts)
continue
top = np.median(top) if len(top) > 4 else np.min(top)
bottom = np.median(bottom) if len(bottom) > 4 else np.max(bottom)
for b in lts:
if b["label"] == "table column":
if b["top"] > top:
b["top"] = top
if b["bottom"] < bottom:
b["bottom"] = bottom
res.append(lts)
return res
@staticmethod
def is_caption(bx):
patt = [
r"[图表]+[ 0-9:]{2,}"
]
if any([re.match(p, bx["text"].strip()) for p in patt]) \
or bx["layout_type"].find("caption") >= 0:
return True
return False
@staticmethod
def blockType(b):
patt = [
("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
(r"^(20|19)[0-9]{2}年$", "Dt"),
(r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
(r"^第*[一二三四1-4]季度$", "Dt"),
(r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
(r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
("^[0-9.,+%/ -]+$", "Nu"),
(r"^[0-9A-Z/\._~-]+$", "Ca"),
(r"^[A-Z]*[a-z' -]+$", "En"),
(r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()' -]+$", "NE"),
(r"^.{1}$", "Sg")
]
for p, n in patt:
if re.search(p, b["text"].strip()):
return n
tks = [t for t in rag_tokenizer.tokenize(b["text"]).split() if len(t) > 1]
if len(tks) > 3:
if len(tks) < 12:
return "Tx"
else:
return "Lx"
if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
return "Nr"
return "Ot"
@staticmethod
def construct_table(boxes, is_english=False, html=False):
cap = ""
i = 0
while i < len(boxes):
if TableStructureRecognizer.is_caption(boxes[i]):
if is_english:
cap += " "
cap += boxes[i]["text"]
boxes.pop(i)
i -= 1
i += 1
if not boxes:
return []
for b in boxes:
b["btype"] = TableStructureRecognizer.blockType(b)
max_type = Counter([b["btype"] for b in boxes]).items()
max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
logging.debug("MAXTYPE: " + max_type)
rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
rowh = np.min(rowh) if rowh else 0
boxes = Recognizer.sort_R_firstly(boxes, rowh / 2)
#for b in boxes:print(b)
boxes[0]["rn"] = 0
rows = [[boxes[0]]]
btm = boxes[0]["bottom"]
for b in boxes[1:]:
b["rn"] = len(rows) - 1
lst_r = rows[-1]
if lst_r[-1].get("R", "") != b.get("R", "") \
or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
): # new row
btm = b["bottom"]
b["rn"] += 1
rows.append([b])
continue
btm = (btm + b["bottom"]) / 2.
rows[-1].append(b)
colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
colwm = np.min(colwm) if colwm else 0
crosspage = len(set([b["page_number"] for b in boxes])) > 1
if crosspage:
boxes = Recognizer.sort_X_firstly(boxes, colwm / 2)
else:
boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
boxes[0]["cn"] = 0
cols = [[boxes[0]]]
right = boxes[0]["x1"]
for b in boxes[1:]:
b["cn"] = len(cols) - 1
lst_c = cols[-1]
if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1][
"page_number"]) \
or (b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")): # new col
right = b["x1"]
b["cn"] += 1
cols.append([b])
continue
right = (right + b["x1"]) / 2.
cols[-1].append(b)
tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
for b in boxes:
tbl[b["rn"]][b["cn"]].append(b)
if len(rows) >= 4:
# remove columns that have only a single occupied cell
j = 0
while j < len(tbl[0]):
e, ii = 0, 0
for i in range(len(tbl)):
if tbl[i][j]:
e += 1
ii = i
if e > 1:
break
if e > 1:
j += 1
continue
f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
[j - 1][0].get("text")) or j == 0
ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
[j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
if f and ff:
j += 1
continue
bx = tbl[ii][j][0]
logging.debug("Relocate column single: " + bx["text"])
# column j has only one value
left, right = 100000, 100000
if j > 0 and not f:
for i in range(len(tbl)):
if tbl[i][j - 1]:
left = min(left, np.min(
[bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
if j + 1 < len(tbl[0]) and not ff:
for i in range(len(tbl)):
if tbl[i][j + 1]:
right = min(right, np.min(
[a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
assert left < 100000 or right < 100000
if left < right:
for jj in range(j, len(tbl[0])):
for i in range(len(tbl)):
for a in tbl[i][jj]:
a["cn"] -= 1
if tbl[ii][j - 1]:
tbl[ii][j - 1].extend(tbl[ii][j])
else:
tbl[ii][j - 1] = tbl[ii][j]
for i in range(len(tbl)):
tbl[i].pop(j)
else:
for jj in range(j + 1, len(tbl[0])):
for i in range(len(tbl)):
for a in tbl[i][jj]:
a["cn"] -= 1
if tbl[ii][j + 1]:
tbl[ii][j + 1].extend(tbl[ii][j])
else:
tbl[ii][j + 1] = tbl[ii][j]
for i in range(len(tbl)):
tbl[i].pop(j)
cols.pop(j)
assert len(cols) == len(tbl[0]), "Column count mismatched: %d vs %d" % (
len(cols), len(tbl[0]))
if len(cols) >= 4:
# remove rows that have only a single occupied cell
i = 0
while i < len(tbl):
e, jj = 0, 0
for j in range(len(tbl[i])):
if tbl[i][j]:
e += 1
jj = j
if e > 1:
break
if e > 1:
i += 1
continue
f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
[jj][0].get("text")) or i == 0
ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
[jj][0].get("text")) or i + 1 >= len(tbl)
if f and ff:
i += 1
continue
bx = tbl[i][jj][0]
logging.debug("Relocate row single: " + bx["text"])
# row i has only one value
up, down = 100000, 100000
if i > 0 and not f:
for j in range(len(tbl[i - 1])):
if tbl[i - 1][j]:
up = min(up, np.min(
[bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
if i + 1 < len(tbl) and not ff:
for j in range(len(tbl[i + 1])):
if tbl[i + 1][j]:
down = min(down, np.min(
[a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
assert up < 100000 or down < 100000
if up < down:
for ii in range(i, len(tbl)):
for j in range(len(tbl[ii])):
for a in tbl[ii][j]:
a["rn"] -= 1
if tbl[i - 1][jj]:
tbl[i - 1][jj].extend(tbl[i][jj])
else:
tbl[i - 1][jj] = tbl[i][jj]
tbl.pop(i)
else:
for ii in range(i + 1, len(tbl)):
for j in range(len(tbl[ii])):
for a in tbl[ii][j]:
a["rn"] -= 1
if tbl[i + 1][jj]:
tbl[i + 1][jj].extend(tbl[i][jj])
else:
tbl[i + 1][jj] = tbl[i][jj]
tbl.pop(i)
rows.pop(i)
# which rows are headers
hdset = set([])
for i in range(len(tbl)):
cnt, h = 0, 0
for j, arr in enumerate(tbl[i]):
if not arr:
continue
cnt += 1
if max_type == "Nu" and arr[0]["btype"] == "Nu":
continue
if any([a.get("H") for a in arr]) \
or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
h += 1
if h / cnt > 0.5:
hdset.add(i)
if html:
return TableStructureRecognizer.__html_table(cap, hdset,
TableStructureRecognizer.__cal_spans(boxes, rows,
cols, tbl, True)
)
return TableStructureRecognizer.__desc_table(cap, hdset,
TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl,
False),
is_english)
@staticmethod
def __html_table(cap, hdset, tbl):
# construct HTML
html = "<table>"
if cap:
html += f"<caption>{cap}</caption>"
for i in range(len(tbl)):
row = "<tr>"
txts = []
for j, arr in enumerate(tbl[i]):
if arr is None:
continue
if not arr:
row += "<td></td>" if i not in hdset else "<th></th>"
continue
txt = ""
if arr:
h = min(np.min([c["bottom"] - c["top"]
for c in arr]) / 2, 10)
txt = " ".join([c["text"]
for c in Recognizer.sort_Y_firstly(arr, h)])
txts.append(txt)
sp = ""
if arr[0].get("colspan"):
sp = "colspan={}".format(arr[0]["colspan"])
if arr[0].get("rowspan"):
sp += " rowspan={}".format(arr[0]["rowspan"])
if i in hdset:
row += f"<th {sp} >" + txt + "</th>"
else:
row += f"<td {sp} >" + txt + "</td>"
if i in hdset:
if all([t in hdset for t in txts]):
continue
for t in txts:
hdset.add(t)
if row != "<tr>":
row += "</tr>"
else:
row = ""
html += "\n" + row
html += "\n</table>"
return html
@staticmethod
def __desc_table(cap, hdr_rowno, tbl, is_english):
# get the text of every column in the header rows to become the header text
clmno = len(tbl[0])
rowno = len(tbl)
headers = {}
hdrset = set()
lst_hdr = []
de = "" if not is_english else " for "
for r in sorted(list(hdr_rowno)):
headers[r] = ["" for _ in range(clmno)]
for i in range(clmno):
if not tbl[r][i]:
continue
txt = " ".join([a["text"].strip() for a in tbl[r][i]])
headers[r][i] = txt
hdrset.add(txt)
if all([not t for t in headers[r]]):
del headers[r]
hdr_rowno.remove(r)
continue
for j in range(clmno):
if headers[r][j]:
continue
if j >= len(lst_hdr):
break
headers[r][j] = lst_hdr[j]
lst_hdr = headers[r]
for i in range(rowno):
if i not in hdr_rowno:
continue
for j in range(i + 1, rowno):
if j not in hdr_rowno:
break
for k in range(clmno):
if not headers[j - 1][k]:
continue
if headers[j][k].find(headers[j - 1][k]) >= 0:
continue
if len(headers[j][k]) > len(headers[j - 1][k]):
headers[j][k] += (de if headers[j][k]
else "") + headers[j - 1][k]
else:
headers[j][k] = headers[j - 1][k] \
+ (de if headers[j - 1][k] else "") \
+ headers[j][k]
logging.debug(
f">>>>>>>>>>>>>>>>>{cap}SIZE:{rowno}X{clmno} Header: {hdr_rowno}")
row_txt = []
for i in range(rowno):
if i in hdr_rowno:
continue
rtxt = []
def append(delimer):
nonlocal rtxt, row_txt
rtxt = delimer.join(rtxt)
if row_txt and len(row_txt[-1]) + len(rtxt) < 64:
row_txt[-1] += "\n" + rtxt
else:
row_txt.append(rtxt)
r = 0
if len(headers.items()):
_arr = [(i - r, r) for r, _ in headers.items() if r < i]
if _arr:
_, r = min(_arr, key=lambda x: x[0])
if r not in headers and clmno <= 2:
for j in range(clmno):
if not tbl[i][j]:
continue
txt = "".join([a["text"].strip() for a in tbl[i][j]])
if txt:
rtxt.append(txt)
if rtxt:
append("")
continue
for j in range(clmno):
if not tbl[i][j]:
continue
txt = "".join([a["text"].strip() for a in tbl[i][j]])
if not txt:
continue
ctt = headers[r][j] if r in headers else ""
if ctt:
ctt += ""
ctt += txt
if ctt:
rtxt.append(ctt)
if rtxt:
row_txt.append("; ".join(rtxt))
if cap:
if is_english:
from_ = " in "
else:
from_ = "来自"
row_txt = [t + f"\t——{from_}{cap}" for t in row_txt]
return row_txt
@staticmethod
def __cal_spans(boxes, rows, cols, tbl, html=True):
# calculate spans
clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
for cln in cols]
crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln])
for cln in cols]
rtop = [np.mean([c.get("R_top", c["top"]) for c in row])
for row in rows]
rbtm = [np.mean([c.get("R_btm", c["bottom"])
for c in row]) for row in rows]
for b in boxes:
if "SP" not in b:
continue
b["colspan"] = [b["cn"]]
b["rowspan"] = [b["rn"]]
# col span
for j in range(0, len(clft)):
if j == b["cn"]:
continue
if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]:
continue
if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]:
continue
b["colspan"].append(j)
# row span
for j in range(0, len(rtop)):
if j == b["rn"]:
continue
if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]:
continue
if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]:
continue
b["rowspan"].append(j)
def join(arr):
if not arr:
return ""
return "".join([t["text"] for t in arr])
# remove the spanning cells
for i in range(len(tbl)):
for j, arr in enumerate(tbl[i]):
if not arr:
continue
if all(["rowspan" not in a and "colspan" not in a for a in arr]):
continue
rowspan, colspan = [], []
for a in arr:
if isinstance(a.get("rowspan", 0), list):
rowspan.extend(a["rowspan"])
if isinstance(a.get("colspan", 0), list):
colspan.extend(a["colspan"])
rowspan, colspan = set(rowspan), set(colspan)
if len(rowspan) < 2 and len(colspan) < 2:
for a in arr:
if "rowspan" in a:
del a["rowspan"]
if "colspan" in a:
del a["colspan"]
continue
rowspan, colspan = sorted(rowspan), sorted(colspan)
rowspan = list(range(rowspan[0], rowspan[-1] + 1))
colspan = list(range(colspan[0], colspan[-1] + 1))
assert i in rowspan, rowspan
assert j in colspan, colspan
arr = []
for r in rowspan:
for c in colspan:
arr_txt = join(arr)
if tbl[r][c] and join(tbl[r][c]) != arr_txt:
arr.extend(tbl[r][c])
tbl[r][c] = None if html else arr
for a in arr:
if len(rowspan) > 1:
a["rowspan"] = len(rowspan)
elif "rowspan" in a:
del a["rowspan"]
if len(colspan) > 1:
a["colspan"] = len(colspan)
elif "colspan" in a:
del a["colspan"]
tbl[rowspan[0]][colspan[0]] = arr
return tbl
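Aside: a minimal, self-contained sketch of the regex bucketing idea behind blockType() above; the reduced pattern set and sample strings are illustrative assumptions.
import re
patterns = [
    (r"^(20|19)[0-9]{2}年$", "Dt"),        # date-like
    (r"^[0-9.,+%/ -]+$", "Nu"),            # numeric
    (r"^[A-Z]*[a-z' -]+$", "En"),          # English word(s)
]
def toy_block_type(text):
    for pattern, tag in patterns:
        if re.search(pattern, text.strip()):
            return tag
    return "Ot"                            # everything else
print([toy_block_type(t) for t in ["2023年", "1,234.5", "Revenue", "其他"]])
# -> ['Dt', 'Nu', 'En', 'Ot']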

View File

@ -9,18 +9,18 @@ declare module 'vue' {
export interface GlobalComponents {
SvgIcon: import("vue").DefineComponent<{
name: {
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management">;
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management">;
default: string;
required: true;
};
}, {}, unknown, {}, {}, import("vue").ComponentOptionsMixin, import("vue").ComponentOptionsMixin, {}, string, import("vue").VNodeProps & import("vue").AllowedComponentProps & import("vue").ComponentCustomProps, Readonly<import("vue").ExtractPropTypes<{
name: {
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management">;
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management">;
default: string;
required: true;
};
}>>, {
name: "conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management";
name: "conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management";
}>;
}
}

View File

@ -7,20 +7,20 @@
declare module '~virtual/svg-component' {
const SvgIcon: import("vue").DefineComponent<{
name: {
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management">;
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management">;
default: string;
required: true;
};
}, {}, unknown, {}, {}, import("vue").ComponentOptionsMixin, import("vue").ComponentOptionsMixin, {}, string, import("vue").VNodeProps & import("vue").AllowedComponentProps & import("vue").ComponentCustomProps, Readonly<import("vue").ExtractPropTypes<{
name: {
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management">;
type: import("vue").PropType<"conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management">;
default: string;
required: true;
};
}>>, {
name: "conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management";
name: "conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management";
}>;
export const svgNames: ["conversation", "dashboard", "file", "fullscreen-exit", "fullscreen", "kb", "keyboard-down", "keyboard-enter", "keyboard-esc", "keyboard-up", "search", "team-management", "user-config", "user-management"];
export type SvgName = "conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "team-management" | "user-config" | "user-management";
export const svgNames: ["conversation", "dashboard", "file", "fullscreen-exit", "fullscreen", "kb", "keyboard-down", "keyboard-enter", "keyboard-esc", "keyboard-up", "search", "storage", "team-management", "user-config", "user-management"];
export type SvgName = "conversation" | "dashboard" | "file" | "fullscreen-exit" | "fullscreen" | "kb" | "keyboard-down" | "keyboard-enter" | "keyboard-esc" | "keyboard-up" | "search" | "storage" | "team-management" | "user-config" | "user-management";
export default SvgIcon;
}

View File

@ -1,15 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View File

@ -1,44 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
from api.db import LLMType
from rag.nlp import rag_tokenizer
from api.db.services.llm_service import LLMBundle
from rag.nlp import tokenize
def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
# is it English
eng = lang.lower() == "english" # is_english(sections)
try:
callback(0.1, "Use Sequence2Txt LLM to transcribe the audio")
seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
ans = seq2txt_mdl.transcription(binary)
callback(0.8, "Sequence2Txt LLM response: %s ..." % ans[:32])
tokenize(doc, ans, eng)
return [doc]
except Exception as e:
callback(prog=-1, msg=str(e))
return []

View File

@ -1,157 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
import re
from io import BytesIO
from deepdoc.parser.utils import get_text
from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts: {}".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
self._naive_vertical_merge()
self._filter_forpages()
self._merge_with_same_bullet()
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
for b in self.boxes], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Since a book is long and not all parts are useful, if it's a PDF,
please set up the page ranges for every book in order to eliminate negative effects and save computing time.
"""
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pdf_parser = None
sections, tbls = [], []
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
doc_parser = DocxParser()
# TODO: the table of contents needs to be removed
sections, tbls = doc_parser(
binary if binary else filename, from_page=from_page, to_page=to_page)
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
tbls = [((None, lns), None) for lns in tbls]
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet (doc, docx, pdf, txt, html supported)")
make_colon_as_title(sections)
bull = bullets_category(
[t for t in random_choices([t for t, _ in sections], k=100)])
if bull >= 0:
chunks = ["\n".join(ck)
for ck in hierarchical_merge(bull, sections, 5)]
else:
sections = [s.split("@") for s, _ in sections]
sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections ]
chunks = naive_merge(
sections, kwargs.get(
"chunk_token_num", 256), kwargs.get(
"delimer", "\n。;!?"))
# is it English
# is_english(random_choices([t for t, _ in sections], k=218))
eng = lang.lower() == "english"
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)

View File

@ -1,117 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from email import policy
from email.parser import BytesParser
from rag.app.naive import chunk as naive_chunk
import re
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
from deepdoc.parser import HtmlParser, TxtParser
from timeit import default_timer as timer
import io
def chunk(
filename,
binary=None,
from_page=0,
to_page=100000,
lang="Chinese",
callback=None,
**kwargs,
):
"""
Only eml is supported
"""
eng = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
"parser_config",
{"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"},
)
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
main_res = []
attachment_res = []
if binary:
msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
else:
msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
text_txt, html_txt = [], []
# get the email header info
for header, value in msg.items():
text_txt.append(f"{header}: {value}")
# get the email main info
def _add_content(msg, content_type):
if content_type == "text/plain":
text_txt.append(
msg.get_payload(decode=True).decode(msg.get_content_charset())
)
elif content_type == "text/html":
html_txt.append(
msg.get_payload(decode=True).decode(msg.get_content_charset())
)
elif "multipart" in content_type:
if msg.is_multipart():
for part in msg.iter_parts():
_add_content(part, part.get_content_type())
_add_content(msg, msg.get_content_type())
sections = TxtParser.parser_txt("\n".join(text_txt)) + [
(line, "") for line in HtmlParser.parser_txt("\n".join(html_txt)) if line
]
st = timer()
chunks = naive_merge(
sections,
int(parser_config.get("chunk_token_num", 128)),
parser_config.get("delimiter", "\n!?。;!?"),
)
main_res.extend(tokenize_chunks(chunks, doc, eng, None))
logging.debug("naive_merge({}): {}".format(filename, timer() - st))
# get the attachment info
for part in msg.iter_attachments():
content_disposition = part.get("Content-Disposition")
if content_disposition:
dispositions = content_disposition.strip().split(";")
if dispositions[0].lower() == "attachment":
filename = part.get_filename()
payload = part.get_payload(decode=True)
try:
attachment_res.extend(
naive_chunk(filename, payload, callback=callback, **kwargs)
)
except Exception:
pass
return main_res + attachment_res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)
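For readers unfamiliar with the email traversal above, a hedged, standard-library-only sketch of the same walk (headers, preferred body part, attachment names) follows. The helper and the "sample.eml" path are placeholders, not part of this repository.
from email import policy
from email.parser import BytesParser

def describe_eml(path):
    # Parse the message, then collect header lines, the preferred body part and attachment names.
    with open(path, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)
    headers = [f"{k}: {v}" for k, v in msg.items()]
    body = msg.get_body(preferencelist=("plain", "html"))
    attachments = [p.get_filename() for p in msg.iter_attachments()]
    return headers, (body.get_content() if body else ""), attachments

if __name__ == "__main__":
    hdrs, text, atts = describe_eml("sample.eml")  # placeholder path
    print(len(hdrs), "headers,", len(atts), "attachments")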

View File

@ -1,216 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
import re
from io import BytesIO
from docx import Document
from api.db import ParserType
from deepdoc.parser.utils import get_text
from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge, \
make_colon_as_title, tokenize_chunks, docx_question_level
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
class Docx(DocxParser):
def __init__(self):
pass
def __clean(self, line):
line = re.sub(r"\u3000", " ", line).strip()
return line
def old_call(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
for p in self.doc.paragraphs:
if pn > to_page:
break
if from_page <= pn < to_page and p.text.strip():
lines.append(self.__clean(p.text))
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
return [line for line in lines if line]
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
bull = bullets_category([p.text for p in self.doc.paragraphs])
for p in self.doc.paragraphs:
if pn > to_page:
break
question_level, p_text = docx_question_level(p, bull)
if not p_text.strip("\n"):
continue
lines.append((question_level, p_text))
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
visit = [False for _ in range(len(lines))]
sections = []
for s in range(len(lines)):
e = s + 1
while e < len(lines):
if lines[e][0] <= lines[s][0]:
break
e += 1
if e - s == 1 and visit[s]:
continue
sec = []
next_level = lines[s][0] + 1
while not sec and next_level < 22:
for i in range(s+1, e):
if lines[i][0] != next_level:
continue
sec.append(lines[i][1])
visit[i] = True
next_level += 1
sec.insert(0, lines[s][1])
sections.append("\n".join(sec))
return [s for s in sections if s]
def __str__(self) -> str:
return f'''
question:{self.question},
answer:{self.answer},
level:{self.level},
childs:{self.childs}
'''
class Pdf(PdfParser):
def __init__(self):
self.model_speciess = ParserType.LAWS.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts:".format(
))
self._naive_vertical_merge()
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], None
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt, html and doc.
"""
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pdf_parser = None
sections = []
# is it English
eng = lang.lower() == "english" # is_english(sections)
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
chunks = Docx()(filename, binary)
callback(0.7, "Finish parsing.")
return tokenize_chunks(chunks, doc, eng, None)
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
for txt, poss in pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)[0]:
sections.append(txt + poss)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
# Remove 'Contents' part
remove_contents_table(sections, eng)
make_colon_as_title(sections)
bull = bullets_category(sections)
chunks = hierarchical_merge(bull, sections, 5)
if not chunks:
callback(0.99, "No chunk parsed out.")
return tokenize_chunks(["\n".join(ck)
for ck in chunks], doc, eng, pdf_parser)
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)
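To make the heading/level grouping in Docx.__call__ above easier to follow, here is a simplified stand-alone sketch of the same idea: each heading pulls in the lines one level below it. The sample data is invented, and the real implementation also handles deeper nesting and visited-line bookkeeping.
def group_by_level(lines):
    # lines: list of (level, text); a lower level means a higher-ranked heading.
    sections = []
    for s, (level, text) in enumerate(lines):
        children = []
        for lvl, txt in lines[s + 1:]:
            if lvl <= level:
                break
            if lvl == level + 1:
                children.append(txt)
        sections.append("\n".join([text] + children))
    return sections

if __name__ == "__main__":
    sample = [(1, "Article 1"), (2, "Clause 1.1"), (2, "Clause 1.2"), (1, "Article 2")]
    print(group_by_level(sample))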

View File

@ -1,282 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import copy
import re
from api.db import ParserType
from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
from rag.utils import num_tokens_from_string
from deepdoc.parser import PdfParser, PlainParser, DocxParser
from docx import Document
from PIL import Image
class Pdf(PdfParser):
def __init__(self):
self.model_speciess = ParserType.MANUAL.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
# for bb in self.boxes:
# for b in bb:
# print(b)
logging.debug("OCR: {}".format(timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.65, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts: {}".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.67, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
self._concat_downward()
self._filter_forpages()
callback(0.68, "Text merged ({:.2f}s)".format(timer() - start))
# clean mess
for b in self.boxes:
b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)], tbls
class Docx(DocxParser):
def __init__(self):
pass
def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic')
if not img:
return None
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
related_part = document.part.related_parts[embed]
image = related_part.image
image = Image.open(BytesIO(image.blob))
return image
def concat_img(self, img1, img2):
if img1 and not img2:
return img1
if not img1 and img2:
return img2
if not img1 and not img2:
return None
width1, height1 = img1.size
width2, height2 = img2.size
new_width = max(width1, width2)
new_height = height1 + height2
new_image = Image.new('RGB', (new_width, new_height))
new_image.paste(img1, (0, 0))
new_image.paste(img2, (0, height1))
return new_image
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
last_answer, last_image = "", None
question_stack, level_stack = [], []
ti_list = []
for p in self.doc.paragraphs:
if pn > to_page:
break
question_level, p_text = 0, ''
if from_page <= pn < to_page and p.text.strip():
question_level, p_text = docx_question_level(p)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{p_text}'
current_image = self.get_picture(self.doc, p)
last_image = self.concat_img(last_image, current_image)
else: # is a question
if last_answer or last_image:
sum_question = '\n'.join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
last_answer, last_image = '', None
i = question_level
while question_stack and i <= level_stack[-1]:
question_stack.pop()
level_stack.pop()
question_stack.append(p_text)
level_stack.append(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
if last_answer:
sum_question = '\n'.join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
tbls = []
for tb in self.doc.tables:
html= "<table>"
for r in tb.rows:
html += "<tr>"
i = 0
while i < len(r.cells):
span = 1
c = r.cells[i]
for j in range(i+1, len(r.cells)):
if c.text == r.cells[j].text:
span += 1
i = j
i += 1
html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
html += "</tr>"
html += "</table>"
tbls.append(((None, html), ""))
return ti_list, tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Only pdf and docx are supported.
"""
pdf_parser = None
doc = {
"docnm_kwd": filename
}
doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
if re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
if sections and len(sections[0]) < 3:
sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
# set pivot using the most frequent type of title,
# then merge between 2 pivot
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
most_level = max(0, max_lvl - 1)
levels = []
for txt, _, _ in sections:
for t, lvl in pdf_parser.outlines:
tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
tks_ = set([txt[i] + txt[i + 1]
for i in range(min(len(t), len(txt) - 1))])
if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
levels.append(lvl)
break
else:
levels.append(max_lvl + 1)
else:
bull = bullets_category([txt for txt, _, _ in sections])
most_level, levels = title_frequency(
bull, [(txt, lvl) for txt, lvl, _ in sections])
assert len(sections) == len(levels)
sec_ids = []
sid = 0
for i, lvl in enumerate(levels):
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
sid += 1
sec_ids.append(sid)
# print(lvl, self.boxes[i]["text"], most_level, sid)
sections = [(txt, sec_ids[i], poss)
for i, (txt, _, poss) in enumerate(sections)]
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0], -1,
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
def tag(pn, left, right, top, bottom):
if pn + left + right + top + bottom == 0:
return ""
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(pn, left, right, top, bottom)
chunks = []
last_sid = -2
tk_cnt = 0
for txt, sec_id, poss in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1])):
poss = "\t".join([tag(*pos) for pos in poss])
if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
if chunks:
chunks[-1] += "\n" + txt + poss
tk_cnt += num_tokens_from_string(txt)
continue
chunks.append(txt + poss)
tk_cnt = num_tokens_from_string(txt)
if sec_id > -1:
last_sid = sec_id
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
res = tokenize_table(tbls, doc, eng)
for text, image in ti_list:
d = copy.deepcopy(doc)
d['image'] = image
tokenize(d, text, eng)
res.append(d)
return res
else:
raise NotImplementedError("file type not supported yet(pdf and docx supported)")
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)
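The pivot/section-id logic above is compact; a small self-contained sketch of the same rule follows (a new section id starts whenever a title at or above the pivot level appears after a different level). The sample levels and the pivot value are made up for illustration.
def assign_section_ids(levels, pivot_level):
    # A new section id starts at a title whose level is at or above the pivot.
    sec_ids, sid = [], 0
    for i, lvl in enumerate(levels):
        if lvl <= pivot_level and i > 0 and lvl != levels[i - 1]:
            sid += 1
        sec_ids.append(sid)
    return sec_ids

if __name__ == "__main__":
    print(assign_section_ids([1, 3, 3, 1, 2, 3], pivot_level=1))  # -> [0, 0, 0, 1, 1, 1]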

View File

@ -1,313 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
from io import BytesIO
from docx import Document
from timeit import default_timer as timer
import re
from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \
naive_merge_docx, tokenize_chunks_docx
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
from rag.utils import num_tokens_from_string
from PIL import Image
from functools import reduce
from markdown import markdown
from docx.image.exceptions import UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError
class Docx(DocxParser):
def __init__(self):
pass
def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic')
if not img:
return None
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
related_part = document.part.related_parts[embed]
try:
image_blob = related_part.image.blob
except UnrecognizedImageError:
logging.info("Unrecognized image format. Skipping image.")
return None
except UnexpectedEndOfFileError:
logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
return None
except InvalidImageStreamError:
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
return None
try:
image = Image.open(BytesIO(image_blob)).convert('RGB')
return image
except Exception:
return None
def __clean(self, line):
line = re.sub(r"\u3000", " ", line).strip()
return line
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
last_image = None
for p in self.doc.paragraphs:
if pn > to_page:
break
if from_page <= pn < to_page:
if p.text.strip():
if p.style and p.style.name == 'Caption':
former_image = None
if lines and lines[-1][1] and lines[-1][2] != 'Caption':
former_image = lines[-1][1].pop()
elif last_image:
former_image = last_image
last_image = None
lines.append((self.__clean(p.text), [former_image], p.style.name))
else:
current_image = self.get_picture(self.doc, p)
image_list = [current_image]
if last_image:
image_list.insert(0, last_image)
last_image = None
lines.append((self.__clean(p.text), image_list, p.style.name if p.style else ""))
else:
if current_image := self.get_picture(self.doc, p):
if lines:
lines[-1][1].append(current_image)
else:
last_image = current_image
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]
tbls = []
for tb in self.doc.tables:
html = "<table>"
for r in tb.rows:
html += "<tr>"
i = 0
while i < len(r.cells):
span = 1
c = r.cells[i]
for j in range(i + 1, len(r.cells)):
if c.text == r.cells[j].text:
span += 1
i = j
else:
break
i += 1
html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
html += "</tr>"
html += "</table>"
tbls.append(((None, html), ""))
return new_line, tbls
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
start = timer()
first_start = start
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
tbls = self._extract_table_figure(True, zoomin, True, True)
# self._naive_vertical_merge()
self._concat_downward()
# self._filter_forpages()
logging.info("layouts cost: {}s".format(timer() - first_start))
return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], tbls
class Markdown(MarkdownParser):
def __call__(self, filename, binary=None):
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(filename, "r") as f:
txt = f.read()
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n')
sections = []
tbls = []
for sec in remainder.split("\n"):
if num_tokens_from_string(sec) > 3 * self.chunk_token_num:
sections.append((sec[:int(len(sec) / 2)], ""))
sections.append((sec[int(len(sec) / 2):], ""))
else:
if sec.strip().find("#") == 0:
sections.append((sec, ""))
elif sections and sections[-1][0].strip().find("#") == 0:
sec_, _ = sections.pop(-1)
sections.append((sec_ + "\n" + sec, ""))
else:
sections.append((sec, ""))
for table in tables:
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
return sections, tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt, markdown, html, json and doc.
This method applies a naive way to chunk files:
successive text is sliced into pieces using the 'delimiter',
and these pieces are then merged into chunks whose token count is no more than 'Max token number'.
"""
is_english = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
pdf_parser = None
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tables = Docx()(filename, binary)
res = tokenize_table(tables, doc, is_english) # just for table
callback(0.8, "Finish parsing.")
st = timer()
chunks, images = naive_merge_docx(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
return chunks
res.extend(tokenize_chunks_docx(chunks, doc, is_english, images))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
callback=callback)
res = tokenize_table(tables, doc, is_english)
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
if parser_config.get("html4excel"):
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
else:
sections = [(_, "") for _ in excel_parser(binary) if _]
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = TxtParser()(filename, binary,
parser_config.get("chunk_token_num", 128),
parser_config.get("delimiter", "\n!?;。;!?"))
callback(0.8, "Finish parsing.")
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tables = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
elif re.search(r"\.json$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
chunk_token_num = int(parser_config.get("chunk_token_num", 128))
sections = JsonParser(chunk_token_num)(binary)
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
else:
callback(0.8, f"tika.parser got empty content from {filename}.")
logging.warning(f"tika.parser got empty content from {filename}.")
return []
else:
raise NotImplementedError(
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
st = timer()
chunks = naive_merge(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
return chunks
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
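As a reading aid for the Markdown branch above, here is a rough stand-alone sketch of the heading-anchored sectioning it performs. The real parser also splits over-long sections and extracts tables; this sketch does not, and its input string is invented.
def split_markdown_sections(text):
    # Glue each "#" heading to the text that immediately follows it.
    sections = []
    for line in text.split("\n"):
        if line.strip().startswith("#"):
            sections.append(line)
        elif sections and sections[-1].strip().startswith("#"):
            sections[-1] = sections[-1] + "\n" + line
        else:
            sections.append(line)
    return sections

if __name__ == "__main__":
    print(split_markdown_sections("# Title\nintro text\n\n## Section\nbody"))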

View File

@ -1,139 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
from io import BytesIO
import re
from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
self._layouts_rec(zoomin, drop=False)
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts cost: {}s".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
tbls = self._extract_table_figure(True, zoomin, True, True)
self._concat_downward()
sections = [(b["text"], self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)]
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt, markdown, html and doc.
One file forms a single chunk that maintains the original text order.
"""
eng = lang.lower() == "english" # is_english(cks)
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tbls = naive.Docx()(filename, binary)
sections = [s for s, _ in sections if s]
for (_, html), _ in tbls:
sections.append(html)
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, _ = pdf_parser(
filename if not binary else binary, to_page=to_page, callback=callback)
sections = [s for s, _ in sections if s]
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
sections = excel_parser.html(binary, 1000000000)
elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
tokenize(doc, "\n".join(sections), eng)
return [doc]
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

View File

@ -1,294 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import copy
import re
from api.db import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser
import numpy as np
class Pdf(PdfParser):
def __init__(self):
self.model_speciess = ParserType.PAPER.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug(f"layouts cost: {timer() - start}s")
start = timer()
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
self._concat_downward()
self._filter_forpages()
callback(0.75, "Text merged ({:.2f}s)".format(timer() - start))
# clean mess
if column_width < self.page_images[0].size[0] / zoomin / 2:
logging.debug("two_column................... {} {}".format(column_width,
self.page_images[0].size[0] / zoomin / 2))
self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
for b in self.boxes:
b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
def _begin(txt):
return re.match(
"[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
txt.lower().strip())
if from_page > 0:
return {
"title": "",
"authors": "",
"abstract": "",
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
re.match(r"(text|title)", b.get("layoutno", "text"))],
"tables": tbls
}
# get title and authors
title = ""
authors = []
i = 0
while i < min(32, len(self.boxes)-1):
b = self.boxes[i]
i += 1
if b.get("layoutno", "").find("title") >= 0:
title = b["text"]
if _begin(title):
title = ""
break
for j in range(3):
if _begin(self.boxes[i + j]["text"]):
break
authors.append(self.boxes[i + j]["text"])
break
break
# get abstract
abstr = ""
i = 0
while i + 1 < min(32, len(self.boxes)):
b = self.boxes[i]
i += 1
txt = b["text"].lower().strip()
if re.match("(abstract|摘要)", txt):
if len(txt.split()) > 32 or len(txt) > 64:
abstr = txt + self._line_tag(b, zoomin)
break
txt = self.boxes[i]["text"].lower().strip()
if len(txt.split()) > 32 or len(txt) > 64:
abstr = txt + self._line_tag(self.boxes[i], zoomin)
i += 1
break
if not abstr:
i = 0
callback(
0.8, "Page {}~{}: Text merging finished".format(
from_page, min(
to_page, self.total_page)))
for b in self.boxes:
logging.debug("{} {}".format(b["text"], b.get("layoutno")))
logging.debug("{}".format(tbls))
return {
"title": title,
"authors": " ".join(authors),
"abstract": abstr,
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
re.match(r"(text|title)", b.get("layoutno", "text"))],
"tables": tbls
}
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
The abstract of the paper is kept as one entire chunk and is never split.
"""
if re.search(r"\.pdf$", filename, re.IGNORECASE):
if kwargs.get("parser_config", {}).get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
paper = {
"title": filename,
"authors": " ",
"abstract": "",
"sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
"tables": []
}
else:
pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
else:
raise NotImplementedError("file type not supported yet(pdf supported)")
doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]),
"title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
logging.debug("It's English.....{}".format(eng))
res = tokenize_table(paper["tables"], doc, eng)
if paper["abstract"]:
d = copy.deepcopy(doc)
txt = pdf_parser.remove_tag(paper["abstract"])
d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
d["important_tks"] = " ".join(d["important_kwd"])
d["image"], poss = pdf_parser.crop(
paper["abstract"], need_position=True)
add_positions(d, poss)
tokenize(d, txt, eng)
res.append(d)
sorted_sections = paper["sections"]
# set pivot using the most frequent type of title,
# then merge between 2 pivot
bull = bullets_category([txt for txt, _ in sorted_sections])
most_level, levels = title_frequency(bull, sorted_sections)
assert len(sorted_sections) == len(levels)
sec_ids = []
sid = 0
for i, lvl in enumerate(levels):
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
sid += 1
sec_ids.append(sid)
logging.debug("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))
chunks = []
last_sid = -2
for (txt, _), sec_id in zip(sorted_sections, sec_ids):
if sec_id == last_sid:
if chunks:
chunks[-1] += "\n" + txt
continue
chunks.append(txt)
last_sid = sec_id
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
"""
readed = [0] * len(paper["lines"])
# find colon firstly
i = 0
while i + 1 < len(paper["lines"]):
txt = pdf_parser.remove_tag(paper["lines"][i][0])
j = i
if txt.strip("\n").strip()[-1] not in ":":
i += 1
continue
i += 1
while i < len(paper["lines"]) and not paper["lines"][i][0]:
i += 1
if i >= len(paper["lines"]): break
proj = [paper["lines"][i][0].strip()]
i += 1
while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
proj.append(paper["lines"][i])
i += 1
for k in range(j, i): readed[k] = True
txt = txt[::-1]
if eng:
r = re.search(r"(.*?) ([\\.;?!]|$)", txt)
txt = r.group(1)[::-1] if r else txt[::-1]
else:
r = re.search(r"(.*?) ([。?;!]|$)", txt)
txt = r.group(1)[::-1] if r else txt[::-1]
for p in proj:
d = copy.deepcopy(doc)
txt += "\n" + pdf_parser.remove_tag(p)
d["image"], poss = pdf_parser.crop(p, need_position=True)
add_positions(d, poss)
tokenize(d, txt, eng)
res.append(d)
i = 0
chunk = []
tk_cnt = 0
def add_chunk():
nonlocal chunk, res, doc, pdf_parser, tk_cnt
d = copy.deepcopy(doc)
ck = "\n".join(chunk)
tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
d["image"], poss = pdf_parser.crop(ck, need_position=True)
add_positions(d, poss)
res.append(d)
chunk = []
tk_cnt = 0
while i < len(paper["lines"]):
if tk_cnt > 128:
add_chunk()
if readed[i]:
i += 1
continue
readed[i] = True
txt, layouts = paper["lines"][i]
txt_ = pdf_parser.remove_tag(txt)
i += 1
cnt = num_tokens_from_string(txt_)
if any([
layouts.find("title") >= 0 and chunk,
cnt + tk_cnt > 128 and tk_cnt > 32,
]):
add_chunk()
chunk = [txt]
tk_cnt = cnt
else:
chunk.append(txt)
tk_cnt += cnt
if chunk: add_chunk()
for i, d in enumerate(res):
print(d)
# d["image"].save(f"./logs/{i}.jpg")
return res
"""
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)
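The final merge above groups consecutive sections that share a section id. A minimal sketch of that grouping is shown below with invented data; the real code additionally carries position tags and token counts, which this sketch omits.
def merge_by_section_id(sections):
    # sections: list of (text, sec_id); consecutive texts with the same id become one chunk.
    chunks, last_sid = [], None
    for text, sec_id in sections:
        if chunks and sec_id == last_sid:
            chunks[-1] += "\n" + text
        else:
            chunks.append(text)
            last_sid = sec_id
    return chunks

if __name__ == "__main__":
    print(merge_by_section_id([("Abstract", 0), ("1 Introduction", 1), ("intro body", 1), ("2 Method", 2)]))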

View File

@ -1,59 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import io
import numpy as np
from PIL import Image
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from rag.nlp import tokenize
from deepdoc.vision import OCR
ocr = OCR()
def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
img = Image.open(io.BytesIO(binary)).convert('RGB')
doc = {
"docnm_kwd": filename,
"image": img
}
bxs = ocr(np.array(img))
txt = "\n".join([t[0] for _, t in bxs if t[0]])
eng = lang.lower() == "english"
callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
if (eng and len(txt.split()) > 32) or len(txt) > 32:
tokenize(doc, txt, eng)
callback(0.8, "OCR results is too long to use CV LLM.")
return [doc]
try:
callback(0.4, "Use CV LLM to describe the picture.")
cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
img_binary = io.BytesIO()
img.save(img_binary, format='JPEG')
img_binary.seek(0)
ans = cv_mdl.describe(img_binary.read())
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans
tokenize(doc, txt, eng)
return [doc]
except Exception as e:
callback(prog=-1, msg=str(e))
return []
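A hedged restatement of the gating rule above, extracted into a tiny helper so the threshold is easy to see; the 32-token/32-character cut-off mirrors the code, while the helper name and sample inputs are invented for illustration.
def needs_vision_model(ocr_text, eng):
    # True when OCR produced too little text and a vision-LLM description is worth requesting.
    long_enough = (eng and len(ocr_text.split()) > 32) or len(ocr_text) > 32
    return not long_enough

if __name__ == "__main__":
    print(needs_vision_model("short caption", eng=True))   # True: fall back to the vision model
    print(needs_vision_model("x" * 100, eng=False))        # False: OCR text is enough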

View File

@ -1,147 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
from io import BytesIO
from PIL import Image
from rag.nlp import tokenize, is_english
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, PptParser, PlainParser
from PyPDF2 import PdfReader as pdf2_read
class Ppt(PptParser):
def __call__(self, fnm, from_page, to_page, callback=None):
txts = super().__call__(fnm, from_page, to_page)
callback(0.5, "Text extraction finished.")
import aspose.slides as slides
import aspose.pydrawing as drawing
imgs = []
with slides.Presentation(BytesIO(fnm)) as presentation:
for i, slide in enumerate(presentation.slides[from_page: to_page]):
buffered = BytesIO()
slide.get_thumbnail(
0.5, 0.5).save(
buffered, drawing.imaging.ImageFormat.jpeg)
imgs.append(Image.open(buffered))
assert len(imgs) == len(
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
callback(0.9, "Image extraction finished")
self.is_english = is_english(txts)
return [(txts[i], imgs[i]) for i in range(len(txts))]
class Pdf(PdfParser):
def __init__(self):
super().__init__()
def __garbage(self, txt):
txt = txt.lower().strip()
if re.match(r"[0-9\.,%/-]+$", txt):
return True
if len(txt) < 3:
return True
return False
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(filename if not binary else binary,
zoomin, from_page, to_page, callback)
callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, min(to_page, self.total_page), timer() - start))
assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
len(self.boxes), len(self.page_images))
res = []
for i in range(len(self.boxes)):
lines = "\n".join([b["text"] for b in self.boxes[i]
if not self.__garbage(b["text"])])
res.append((lines, self.page_images[i]))
callback(0.9, "Page {}~{}: Parsing finished".format(
from_page, min(to_page, self.total_page)))
return res
class PlainPdf(PlainParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, callback=None, **kwargs):
self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
page_txt = []
for page in self.pdf.pages[from_page: to_page]:
page_txt.append(page.extract_text())
callback(0.9, "Parsing finished")
return [(txt, None) for txt in page_txt]
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are pdf and pptx.
Every page is treated as a chunk, and the thumbnail of every page is stored.
PPT files are parsed with this method automatically; no per-file setup is necessary.
"""
eng = lang.lower() == "english"
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
ppt_parser = Ppt()
for pn, (txt, img) in enumerate(ppt_parser(
filename if not binary else binary, from_page, 1000000, callback)):
d = copy.deepcopy(doc)
pn += from_page
d["image"] = img
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
tokenize(d, txt, eng)
res.append(d)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
for pn, (txt, img) in enumerate(pdf_parser(filename, binary,
from_page=from_page, to_page=to_page, callback=callback)):
d = copy.deepcopy(doc)
pn += from_page
if img:
d["image"] = img
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
tokenize(d, txt, eng)
res.append(d)
return res
raise NotImplementedError(
"file type not supported yet(pptx, pdf supported)")
if __name__ == "__main__":
import sys
def dummy(a, b):
pass
chunk(sys.argv[1], callback=dummy)
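For orientation, a sketch of the per-page record this chunker builds (1-based page number, top offset and thumbnail bounding box). The field names follow the code above; the "text" field stands in for the content that tokenize() would normally add, and the sizes are invented.
def page_record(pn, text, width, height):
    # One chunk per page: 1-based page number, a zero top offset and the thumbnail bounding box.
    return {
        "page_num_int": [pn + 1],
        "top_int": [0],
        "position_int": [(pn + 1, 0, width, 0, height)],
        "text": text,  # stand-in for the tokenized fields produced by tokenize()
    }

if __name__ == "__main__":
    print(page_record(0, "slide one text", 960, 540))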

View File

@ -21,17 +21,30 @@ from copy import deepcopy
from io import BytesIO
from timeit import default_timer as timer
from openpyxl import load_workbook
from deepdoc.parser.utils import get_text
from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
from rag.nlp import rag_tokenizer, tokenize_table, concat_img
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from docx import Document
from PIL import Image
from markdown import markdown
from rag.nlp import find_codec
class Excel(ExcelParser):
def get_text(fnm: str, binary=None) -> str:
txt = ""
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r") as f:
while True:
line = f.readline()
if not line:
break
txt += line
return txt
class Excel:
def __call__(self, fnm, binary=None, callback=None):
if not binary:
wb = load_workbook(fnm)
@ -61,31 +74,18 @@ class Excel(ExcelParser):
else:
fails.append(str(i + 1))
if len(res) % 999 == 0:
callback(len(res) *
0.6 /
total, ("Extract pairs: {}".format(len(res)) +
(f"{len(fails)} failure, line: %s..." %
(",".join(fails[:3])) if fails else "")))
callback(len(res) * 0.6 / total, ("Extract pairs: {}".format(len(res)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(0.6, ("Extract pairs: {}. ".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
self.is_english = is_english(
[rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
callback(0.6, ("Extract pairs: {}. ".format(len(res)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
self.is_english = is_english([rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
return res
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
class Pdf:
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.debug("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
start = timer()
@ -100,9 +100,9 @@ class Pdf(PdfParser):
self._text_merge()
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
tbls = self._extract_table_figure(True, zoomin, True, True)
#self._naive_vertical_merge()
# self._naive_vertical_merge()
# self._concat_downward()
#self._filter_forpages()
# self._filter_forpages()
logging.debug("layouts: {}".format(timer() - start))
sections = [b["text"] for b in self.boxes]
bull_x0_list = []
@ -110,57 +110,61 @@ class Pdf(PdfParser):
if q_bull == -1:
raise ValueError("Unable to recognize Q&A structure.")
qai_list = []
last_q, last_a, last_tag = '', '', ''
last_q, last_a, last_tag = "", "", ""
last_index = -1
last_box = {'text':''}
last_box = {"text": ""}
last_bull = None
def sort_key(element):
tbls_pn = element[1][0][0]
tbls_top = element[1][0][3]
return tbls_pn, tbls_top
tbls.sort(key=sort_key)
tbl_index = 0
last_pn, last_bottom = 0, 0
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = 1, 0, 0, 0, 0, "@@0\t0\t0\t0\t0##", ""
for box in self.boxes:
section, line_tag = box['text'], self._line_tag(box, zoomin)
section, line_tag = box["text"], self._line_tag(box, zoomin)
has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
last_box, last_index, last_bull = box, index, has_bull
line_pn = float(line_tag.lstrip('@@').split('\t')[0])
line_top = float(line_tag.rstrip('##').split('\t')[3])
line_pn = float(line_tag.lstrip("@@").split("\t")[0])
line_top = float(line_tag.rstrip("##").split("\t")[3])
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
if not has_bull: # No question bullet
if not last_q:
if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top): # image passed
if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top): # image passed
tbl_index += 1
continue
else:
sum_tag = line_tag
sum_section = section
while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the middle of current answer
sum_tag = f'{tbl_tag}{sum_tag}'
sum_section = f'{tbl_text}{sum_section}'
while ((tbl_pn == last_pn and tbl_top >= last_bottom) or (tbl_pn > last_pn)) and (
(tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)
): # add image at the middle of current answer
sum_tag = f"{tbl_tag}{sum_tag}"
sum_section = f"{tbl_text}{sum_section}"
tbl_index += 1
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
last_a = f'{last_a}{sum_section}'
last_tag = f'{last_tag}{sum_tag}'
last_a = f"{last_a}{sum_section}"
last_tag = f"{last_tag}{sum_tag}"
else:
if last_q:
while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the end of last answer
last_tag = f'{last_tag}{tbl_tag}'
last_a = f'{last_a}{tbl_text}'
while ((tbl_pn == last_pn and tbl_top >= last_bottom) or (tbl_pn > last_pn)) and (
(tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)
): # add image at the end of last answer
last_tag = f"{last_tag}{tbl_tag}"
last_a = f"{last_a}{tbl_text}"
tbl_index += 1
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
image, poss = self.crop(last_tag, need_position=True)
qai_list.append((last_q, last_a, image, poss))
last_q, last_a, last_tag = '', '', ''
last_q, last_a, last_tag = "", "", ""
last_q = has_bull.group()
_, end = has_bull.span()
last_a = section[end:]
last_tag = line_tag
last_bottom = float(line_tag.rstrip('##').split('\t')[4])
last_bottom = float(line_tag.rstrip("##").split("\t")[4])
last_pn = line_pn
if last_q:
qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
@ -168,36 +172,34 @@ class Pdf(PdfParser):
def get_tbls_info(self, tbls, tbl_index):
if tbl_index >= len(tbls):
return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
tbl_pn = tbls[tbl_index][1][0][0]+1
return 1, 0, 0, 0, 0, "@@0\t0\t0\t0\t0##", ""
tbl_pn = tbls[tbl_index][1][0][0] + 1
tbl_left = tbls[tbl_index][1][0][1]
tbl_right = tbls[tbl_index][1][0][2]
tbl_top = tbls[tbl_index][1][0][3]
tbl_bottom = tbls[tbl_index][1][0][4]
tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
_tbl_text = ''.join(tbls[tbl_index][0][1])
tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
_tbl_text = "".join(tbls[tbl_index][0][1])
return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, _tbl_text
class Docx(DocxParser):
class Docx:
def __init__(self):
pass
def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic')
img = paragraph._element.xpath(".//pic:pic")
if not img:
return None
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
embed = img.xpath(".//a:blip/@r:embed")[0]
related_part = document.part.related_parts[embed]
image = related_part.image
image = Image.open(BytesIO(image.blob)).convert('RGB')
image = Image.open(BytesIO(image.blob)).convert("RGB")
return image
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
last_answer, last_image = "", None
question_stack, level_stack = [], []
@ -205,19 +207,19 @@ class Docx(DocxParser):
for p in self.doc.paragraphs:
if pn > to_page:
break
question_level, p_text = 0, ''
question_level, p_text = 0, ""
if from_page <= pn < to_page and p.text.strip():
question_level, p_text = docx_question_level(p)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{p_text}'
if not question_level or question_level > 6: # not a question
last_answer = f"{last_answer}\n{p_text}"
current_image = self.get_picture(self.doc, p)
last_image = concat_img(last_image, current_image)
else: # is a question
else: # is a question
if last_answer or last_image:
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
qai_list.append((sum_question, last_answer, last_image))
last_answer, last_image = '', None
last_answer, last_image = "", None
i = question_level
while question_stack and i <= level_stack[-1]:
@ -226,26 +228,26 @@ class Docx(DocxParser):
question_stack.append(p_text)
level_stack.append(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
if "lastRenderedPageBreak" in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
if last_answer:
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
qai_list.append((sum_question, last_answer, last_image))
tbls = []
for tb in self.doc.tables:
html= "<table>"
html = "<table>"
for r in tb.rows:
html += "<tr>"
i = 0
while i < len(r.cells):
span = 1
c = r.cells[i]
for j in range(i+1, len(r.cells)):
for j in range(i + 1, len(r.cells)):
if c.text == r.cells[j].text:
span += 1
i = j
@ -258,15 +260,13 @@ class Docx(DocxParser):
def rmPrefix(txt):
return re.sub(
r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t: ]+", "", txt.strip(), flags=re.IGNORECASE)
return re.sub(r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t: ]+", "", txt.strip(), flags=re.IGNORECASE)
def beAdocPdf(d, q, a, eng, image, poss):
qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join(
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_with_weight"] = "\t".join([qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_ltks"] = rag_tokenizer.tokenize(q)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["image"] = image
@ -277,8 +277,7 @@ def beAdocPdf(d, q, a, eng, image, poss):
def beAdocDocx(d, q, a, eng, image, row_num=-1):
qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join(
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_with_weight"] = "\t".join([qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_ltks"] = rag_tokenizer.tokenize(q)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["image"] = image
@ -290,8 +289,7 @@ def beAdocDocx(d, q, a, eng, image, row_num=-1):
def beAdoc(d, q, a, eng, row_num=-1):
qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join(
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_with_weight"] = "\t".join([qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_ltks"] = rag_tokenizer.tokenize(q)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
if row_num >= 0:
@ -300,28 +298,25 @@ def beAdoc(d, q, a, eng, row_num=-1):
def mdQuestionLevel(s):
match = re.match(r'#*', s)
return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
match = re.match(r"#*", s)
return (len(match.group(0)), s.lstrip("#").lstrip()) if match else (0, s)
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
If the file is in Excel format, it should contain two columns, question and answer, with no header.
The question column must come before the answer column.
Multiple sheets are fine as long as the columns are composed correctly.
Excel and csv(txt) format files are supported.
If the file is in Excel format, it should contain two columns, question and answer, with no header.
The question column must come before the answer column.
Multiple sheets are fine as long as the columns are composed correctly.
If it's in csv format, it should be UTF-8 encoded, with a TAB as the delimiter between question and answer.
If it's in csv format, it should be UTF-8 encoded, with a TAB as the delimiter between question and answer.
All malformed lines will be ignored.
Every pair of Q&A will be treated as a chunk.
All malformed lines will be ignored.
Every pair of Q&A will be treated as a chunk.
"""
eng = lang.lower() == "english"
res = []
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()
@ -350,21 +345,19 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
if question:
answer += "\n" + lines[i]
else:
fails.append(str(i+1))
fails.append(str(i + 1))
elif len(arr) == 2:
if question and answer:
res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
question, answer = arr
i += 1
if len(res) % 999 == 0:
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
if question:
res.append(beAdoc(deepcopy(doc), question, answer, eng, len(lines)))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
@ -390,21 +383,18 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
question, answer = row
if len(res) % 999 == 0:
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
if question:
res.append(beAdoc(deepcopy(doc), question, answer, eng, len(list(reader))))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
pdf_parser = Pdf()
qai_list, tbls = pdf_parser(filename if not binary else binary,
from_page=0, to_page=10000, callback=callback)
qai_list, tbls = pdf_parser(filename if not binary else binary, from_page=0, to_page=10000, callback=callback)
for q, a, image, poss in qai_list:
res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
return res
@ -417,20 +407,20 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
question_stack, level_stack = [], []
code_block = False
for index, line in enumerate(lines):
if line.strip().startswith('```'):
if line.strip().startswith("```"):
code_block = not code_block
question_level, question = 0, ''
question_level, question = 0, ""
if not code_block:
question_level, question = mdQuestionLevel(line)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{line}'
else: # is a question
if not question_level or question_level > 6: # not a question
last_answer = f"{last_answer}\n{line}"
else: # is a question
if last_answer.strip():
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
last_answer = ''
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=["markdown.extensions.tables"]), eng, index))
last_answer = ""
i = question_level
while question_stack and i <= level_stack[-1]:
@ -439,22 +429,20 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
question_stack.append(question)
level_stack.append(question_level)
if last_answer.strip():
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=["markdown.extensions.tables"]), eng, index))
return res
elif re.search(r"\.docx$", filename, re.IGNORECASE):
docx_parser = Docx()
qai_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
qai_list, tbls = docx_parser(filename, binary, from_page=0, to_page=10000, callback=callback)
res = tokenize_table(tbls, doc, eng)
for i, (q, a, image) in enumerate(qai_list):
res.append(beAdocDocx(deepcopy(doc), q, a, eng, image, i))
return res
raise NotImplementedError(
"Excel, csv(txt), pdf, markdown and docx format files are supported.")
raise NotImplementedError("Excel, csv(txt), pdf, markdown and docx format files are supported.")
if __name__ == "__main__":
@ -462,4 +450,5 @@ if __name__ == "__main__":
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

View File

@ -16,56 +16,29 @@
import logging
import base64
import datetime
import json
import re
import pandas as pd
import requests
from api.db.services.knowledgebase_service import KnowledgebaseService
from rag.nlp import rag_tokenizer
from deepdoc.parser.resume import refactor
from deepdoc.parser.resume import step_one, step_two
from rag.utils import rmSpace
forbidden_select_fields4resume = [
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
]
forbidden_select_fields4resume = ["name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"]
def remote_call(filename, binary):
q = {
"header": {
"uid": 1,
"user": "kevinhu",
"log_id": filename
},
"header": {"uid": 1, "user": "kevinhu", "log_id": filename},
"request": {
"p": {
"request_id": "1",
"encrypt_type": "base64",
"filename": filename,
"langtype": '',
"fileori": base64.b64encode(binary).decode('utf-8')
},
"p": {"request_id": "1", "encrypt_type": "base64", "filename": filename, "langtype": "", "fileori": base64.b64encode(binary).decode("utf-8")},
"c": "resume_parse_module",
"m": "resume_parse"
}
"m": "resume_parse",
},
}
for _ in range(3):
try:
resume = requests.post(
"http://127.0.0.1:61670/tog",
data=json.dumps(q))
resume = requests.post("http://127.0.0.1:61670/tog", data=json.dumps(q))
resume = resume.json()["response"]["results"]
resume = refactor(resume)
for k in ["education", "work", "project",
"training", "skill", "certificate", "language"]:
if not resume.get(k) and k in resume:
del resume[k]
resume = step_one.refactor(pd.DataFrame([{"resume_content": json.dumps(resume), "tob_resume_id": "x",
"updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
resume = step_two.parse(resume)
return resume
except Exception:
logging.exception("Resume parser has not been supported yet!")
@ -103,23 +76,19 @@ def chunk(filename, binary=None, callback=None, **kwargs):
"expect_city_names_tks": "期望城市",
"work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
"corporation_name_tks": "最近就职(上班)的公司/上一家公司",
"first_school_name_tks": "第一学历毕业学校",
"first_degree_kwd": "第一学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"highest_degree_kwd": "最高学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"first_major_tks": "第一学历专业",
"edu_first_fea_kwd": "第一学历标签211留学双一流985海外知名重点大学中专专升本专科本科大专",
"degree_kwd": "过往学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"major_tks": "学过的专业/过往专业",
"school_name_tks": "学校/毕业院校",
"sch_rank_kwd": "学校标签(顶尖学校,精英学校,优质学校,一般学校)",
"edu_fea_kwd": "教育标签211留学双一流985海外知名重点大学中专专升本专科本科大专",
"corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
"edu_end_int": "毕业年份",
"industry_name_tks": "所在行业",
"birth_dt": "生日/出生年份",
"expect_position_name_tks": "期望职位/期望职能/期望岗位",
}
@ -132,10 +101,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
if n.find("tks") > 0:
v = rmSpace(v)
titles.append(str(v))
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")
}
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pairs = []
for n, m in field_map.items():
@ -148,23 +114,20 @@ def chunk(filename, binary=None, callback=None, **kwargs):
v = rmSpace(v)
pairs.append((m, str(v)))
doc["content_with_weight"] = "\n".join(
["{}: {}".format(re.sub(r"[^]+", "", k), v) for k, v in pairs])
doc["content_with_weight"] = "\n".join(["{}: {}".format(re.sub(r"[^]+", "", k), v) for k, v in pairs])
doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"])
doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"])
for n, _ in field_map.items():
if n not in resume:
continue
if isinstance(resume[n], list) and (
len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
if isinstance(resume[n], list) and (len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
resume[n] = resume[n][0]
if n.find("_tks") > 0:
resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
doc[n] = resume[n]
logging.debug("chunked resume to " + str(doc))
KnowledgebaseService.update_parser_config(
kwargs["kb_id"], {"field_map": field_map})
KnowledgebaseService.update_parser_config(kwargs["kb_id"], {"field_map": field_map})
return [doc]
@ -173,4 +136,5 @@ if __name__ == "__main__":
def dummy(a, b):
pass
chunk(sys.argv[1], callback=dummy)

View File

@ -1,250 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
from io import BytesIO
from xpinyin import Pinyin
import numpy as np
import pandas as pd
# from openpyxl import load_workbook, Workbook
from dateutil.parser import parse as datetime_parse
from api.db.services.knowledgebase_service import KnowledgebaseService
from deepdoc.parser.utils import get_text
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import ExcelParser
class Excel(ExcelParser):
def __call__(self, fnm, binary=None, from_page=0,
to_page=10000000000, callback=None):
if not binary:
wb = Excel._load_excel_to_workbook(fnm)
else:
wb = Excel._load_excel_to_workbook(BytesIO(binary))
total = 0
for sheetname in wb.sheetnames:
total += len(list(wb[sheetname].rows))
res, fails, done = [], [], 0
rn = 0
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:
continue
headers = [cell.value for cell in rows[0]]
missed = set([i for i, h in enumerate(headers) if h is None])
headers = [
cell.value for i,
cell in enumerate(
rows[0]) if i not in missed]
if not headers:
continue
data = []
for i, r in enumerate(rows[1:]):
rn += 1
if rn - 1 < from_page:
continue
if rn - 1 >= to_page:
break
row = [
cell.value for ii,
cell in enumerate(r) if ii not in missed]
if len(row) != len(headers):
fails.append(str(i))
continue
data.append(row)
done += 1
if np.array(data).size == 0:
continue
res.append(pd.DataFrame(np.array(data), columns=headers))
callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
def trans_datatime(s):
try:
return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S")
except Exception:
pass
def trans_bool(s):
if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$",
str(s).strip(), flags=re.IGNORECASE):
return "yes"
if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
return "no"
def column_data_type(arr):
arr = list(arr)
counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
trans = {t: f for f, t in
[(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
for a in arr:
if a is None:
continue
if re.match(r"[+-]?[0-9]{,19}(\.0+)?$", str(a).replace("%%", "")):
counts["int"] += 1
elif re.match(r"[+-]?[0-9.]{,19}$", str(a).replace("%%", "")):
counts["float"] += 1
elif re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√|false|no|否|⍻|×)$", str(a), flags=re.IGNORECASE):
counts["bool"] += 1
elif trans_datatime(str(a)):
counts["datetime"] += 1
else:
counts["text"] += 1
counts = sorted(counts.items(), key=lambda x: x[1] * -1)
ty = counts[0][0]
for i in range(len(arr)):
if arr[i] is None:
continue
try:
arr[i] = trans[ty](str(arr[i]))
except Exception:
arr[i] = None
# if ty == "text":
# if len(arr) > 128 and uni / len(arr) < 0.1:
# ty = "keyword"
return arr, ty
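# Illustrative sketch (not part of this commit): the inference column_data_type above
# is meant to perform on a small made-up column. Mostly-integer values vote the type
# to "int"; every value is then coerced with int(), and failures become None.
sample_column = ["1", "2", "3", "oops", None]
# expected: arr == [1, 2, 3, None, None], ty == "int"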
def chunk(filename, binary=None, from_page=0, to_page=10000000000,
lang="Chinese", callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
For csv or txt files, the delimiter between columns is TAB.
The first line must be the column headers.
Column headers must be meaningful terms in order for our NLP model to understand them.
It helps to enumerate synonyms, separated by a slash '/', and even better to
enumerate possible values in brackets, like 'gender/sex(male, female)'.
Here are some examples of headers:
1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
2. 姓名/名字\t电话/手机/微信\t最高学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）
Every row in the table will be treated as a chunk.
"""
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()
dfs = excel_parser(
filename,
binary,
from_page=from_page,
to_page=to_page,
callback=callback)
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
lines = txt.split("\n")
fails = []
headers = lines[0].split(kwargs.get("delimiter", "\t"))
rows = []
for i, line in enumerate(lines[1:]):
if i < from_page:
continue
if i >= to_page:
break
row = [field for field in line.split(kwargs.get("delimiter", "\t"))]
if len(row) != len(headers):
fails.append(str(i))
continue
rows.append(row)
callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
dfs = [pd.DataFrame(np.array(rows), columns=headers)]
else:
raise NotImplementedError(
"file type not supported yet(excel, text, csv supported)")
res = []
PY = Pinyin()
fieds_map = {
"text": "_tks",
"int": "_long",
"keyword": "_kwd",
"float": "_flt",
"datetime": "_dt",
"bool": "_kwd"}
for df in dfs:
for n in ["id", "_id", "index", "idx"]:
if n in df.columns:
del df[n]
clmns = df.columns.values
txts = list(copy.deepcopy(clmns))
py_clmns = [
PY.get_pinyins(
re.sub(
r"(/.*|[^]+?|\([^()]+?\))",
"",
str(n)),
'_')[0] for n in clmns]
clmn_tys = []
for j in range(len(clmns)):
cln, ty = column_data_type(df[clmns[j]])
clmn_tys.append(ty)
df[clmns[j]] = cln
if ty == "text":
txts.extend([str(c) for c in cln if c])
clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " "))
for i in range(len(clmns))]
eng = lang.lower() == "english" # is_english(txts)
for ii, row in df.iterrows():
d = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
row_txt = []
for j in range(len(clmns)):
if row[clmns[j]] is None:
continue
if not str(row[clmns[j]]):
continue
if not isinstance(row[clmns[j]], pd.Series) and pd.isna(row[clmns[j]]):
continue
fld = clmns_map[j][0]
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
row[clmns[j]])
row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
if not row_txt:
continue
tokenize(d, "; ".join(row_txt), eng)
res.append(d)
KnowledgebaseService.update_parser_config(
kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
callback(0.35, "")
return res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)

View File

@ -17,10 +17,24 @@ import json
import re
import csv
from copy import deepcopy
from deepdoc.parser.utils import get_text
from rag.app.qa import Excel
from rag.nlp import rag_tokenizer
from rag.nlp import find_codec
def get_text(fnm: str, binary=None) -> str:
txt = ""
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r") as f:
while True:
line = f.readline()
if not line:
break
txt += line
return txt
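# Illustrative sketch (not part of this commit): the two ways the get_text helper
# above is meant to be called. The file name and sample bytes are made up.
sample_bytes = "question\tanswer\n".encode("utf-8")
text_from_binary = get_text("upload.txt", binary=sample_bytes)  # decoded via find_codec
# text_from_disk = get_text("/path/to/local.txt")               # read line by line from disk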
def beAdoc(d, q, a, eng, row_num=-1):