RAGflow/rag/nlp/synonym.py

101 lines
3.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import json
import os
import time
import re
from nltk.corpus import wordnet
from api.utils.file_utils import get_project_base_directory
class Dealer:
    """Synonym lookup backed by WordNet and a custom dictionary.

    Lower-case ASCII words are resolved through NLTK's WordNet corpus. Every
    other term is resolved through ``self.dictionary``, which is read from
    ``rag/res/synonym.json`` under the project base directory and, when a
    redis client is supplied, periodically replaced by the JSON stored under
    the ``kevin_synonyms`` key.
    """

    def __init__(self, redis=None):
        """Load the bundled synonym file and attempt a first redis refresh.

        Args:
            redis: Optional client exposing ``get(key)``. When falsy, the
                dictionary is never refreshed after construction.
        """
        # Start above the lookup threshold and far enough in the past that
        # neither throttle in load() blocks the very first refresh.
        self.lookup_num = 100000000
        self.load_tm = time.time() - 1000000
        self.dictionary = None
        path = os.path.join(get_project_base_directory(), "rag/res", "synonym.json")
        try:
            # Context manager closes the handle; JSON text is UTF-8, so do not
            # depend on the platform's default encoding.
            with open(path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except Exception:
            logging.warning("Missing synonym.json")
            loaded = {}
        # lookup() calls .get() on the dictionary, so anything that is not a
        # JSON object is treated the same as a missing file.
        self.dictionary = loaded if isinstance(loaded, dict) else {}

        if not redis:
            logging.warning("Realtime synonym is disabled, since no redis connection.")
        if not self.dictionary:
            logging.warning("Fail to load synonym")
        self.redis = redis
        self.load()

    def load(self):
        """Refresh ``self.dictionary`` from redis, subject to throttling.

        Nothing happens when there is no redis client, when fewer than 100
        dictionary lookups were made since the last attempt, or when the last
        attempt was less than 3600 seconds ago. A missing, unparsable or
        non-object payload leaves the current dictionary untouched.
        """
        if not self.redis:
            return
        if self.lookup_num < 100:
            return
        now = time.time()
        if now - self.load_tm < 3600:
            return

        # Reset both throttles before talking to redis so that a failed
        # attempt is not retried on every subsequent lookup.
        self.load_tm = now
        self.lookup_num = 0
        payload = self.redis.get("kevin_synonyms")
        if not payload:
            return
        try:
            candidate = json.loads(payload)
        except Exception as e:
            logging.error("Fail to load synonym!" + str(e))
            return
        if not isinstance(candidate, dict):
            # Keep the previous dictionary: a list/str/number here would make
            # every later lookup() fail on ``.get``.
            logging.error("Fail to load synonym! Expected a JSON object, got %s", type(candidate).__name__)
            return
        self.dictionary = candidate

    def lookup(self, tk, topn=8):
        """Return synonyms for the term ``tk``.

        Args:
            tk (str): Term to look up (e.g. "happy" or a Chinese word).
            topn (int): Maximum number of synonyms to return. Defaults to 8.

        Returns:
            list: Synonyms for ``tk``; empty when none are known.

        Processing:
            1. Terms made only of lower-case ASCII letters are resolved
               through WordNet synsets.
            2. Anything else is looked up in the custom dictionary after
               lower-casing and collapsing runs of spaces/tabs.
        """
        # English word branch.
        if re.match(r"[a-z]+$", tk):
            lemmas = (re.sub("_", " ", syn.name().split(".")[0]) for syn in wordnet.synsets(tk))
            # dict.fromkeys de-duplicates while keeping the order in which
            # WordNet returned the synsets, so the topn cut is deterministic.
            unique = dict.fromkeys(name for name in lemmas if name and name != tk)
            return list(unique)[:topn]

        # Chinese / other terms: count the lookup and give the redis-backed
        # dictionary a chance to refresh.
        self.lookup_num += 1
        self.load()
        # Fetch from the dictionary, defaulting to an empty list.
        res = self.dictionary.get(re.sub(r"[ \t]+", " ", tk.lower()), [])
        # Compatibility: a bare string value is treated as a one-element list.
        if isinstance(res, str):
            res = [res]
        return res[:topn]
if __name__ == "__main__":
    # Manual smoke check: build a dealer without redis and dump what it loaded.
    print(Dealer().dictionary)