parent 586b0f7305
commit a51b3168a0
@@ -239,6 +239,7 @@ def chat(dialog, messages, stream=True, **kwargs):
prompt4citation = ""
if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
prompt4citation = citation_prompt()
+# Filter out system-role messages (the system message has already been handled separately above)
msg.extend([{"role": m["role"], "content": re.sub(r"##\d+\$\$", "", m["content"])}
for m in messages if m["role"] != "system"])
used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.95))
@@ -309,16 +310,20 @@ def chat(dialog, messages, stream=True, **kwargs):
return {"answer": think+answer, "reference": refs, "prompt": re.sub(r"\n", " \n", prompt), "created_at": time.time()}

if stream:
-last_ans = ""
+last_ans = ""  # the full answer returned on the previous yield
-answer = ""
+answer = ""  # the accumulated full answer so far
for ans in chat_mdl.chat_streamly(prompt+prompt4citation, msg[1:], gen_conf):
+# If a reasoning trace (thought) exists, strip the related <think> markers
if thought:
ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
answer = ans
+# Compute the newly generated text fragment (delta)
delta_ans = ans[len(last_ans):]
+# If the delta is too small (fewer than 16 tokens), skip this yield to avoid sending tiny fragments too often
if num_tokens_from_string(delta_ans) < 16:
continue
last_ans = answer
+# Yield the accumulated answer so far (including the reasoning trace) plus audio for the new fragment
yield {"answer": thought+answer, "reference": {}, "audio_binary": tts(tts_mdl, delta_ans)}
delta_ans = answer[len(last_ans):]
if delta_ans:
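The streaming branch above throttles how often partial answers are emitted: it only yields once at least 16 new tokens have accumulated since the last yield. A minimal, self-contained sketch of that pattern follows; `count_tokens` and the chunk source are stand-ins for `num_tokens_from_string` and `chat_streamly`, not the project's own helpers.

```python
from typing import Iterable, Iterator

def count_tokens(text: str) -> int:
    # Crude stand-in for num_tokens_from_string: whitespace token count.
    return len(text.split())

def throttle_stream(chunks: Iterable[str], min_tokens: int = 16) -> Iterator[str]:
    """Yield the growing answer only when at least `min_tokens` new tokens have arrived."""
    last_ans = ""   # full answer at the previous yield
    answer = ""     # accumulated full answer so far
    for ans in chunks:          # each chunk is the full answer so far, as in chat_streamly
        answer = ans
        delta = ans[len(last_ans):]
        if count_tokens(delta) < min_tokens:
            continue            # too little new text, wait for more
        last_ans = answer
        yield answer
    if answer[len(last_ans):]:  # flush whatever remains at the end
        yield answer
```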
@@ -471,20 +476,48 @@ def tts(tts_mdl, text):


def ask(question, kb_ids, tenant_id):
+"""
+Handle a user search request: retrieve relevant information from the knowledge bases and generate an answer.
+
+Args:
+    question (str): the user's question or query
+    kb_ids (list): list of knowledge base IDs to search
+    tenant_id (str): tenant ID, used for access control and resource isolation
+
+Flow:
+    1. Fetch the specified knowledge bases
+    2. Determine which embedding model to use
+    3. Pick a retriever based on the knowledge base type (regular or knowledge-graph retriever)
+    4. Initialize the embedding model and the chat model
+    5. Run retrieval to get the relevant chunks
+    6. Format the knowledge base content as context
+    7. Build the system prompt
+    8. Generate the answer and add citation markers
+    9. Stream the generated answer back
+
+Returns:
+    generator: yields dicts containing the answer and reference information
+"""
kbs = KnowledgebaseService.get_by_ids(kb_ids)
embedding_list = list(set([kb.embd_id for kb in kbs]))

is_knowledge_graph = all([kb.parser_id == ParserType.KG for kb in kbs])
retriever = settings.retrievaler if not is_knowledge_graph else settings.kg_retrievaler
+# Initialize the embedding model used to turn text into vector representations
embd_mdl = LLMBundle(tenant_id, LLMType.EMBEDDING, embedding_list[0])
+# Initialize the chat model used to generate the answer
chat_mdl = LLMBundle(tenant_id, LLMType.CHAT)
+# Maximum token length of the chat model, used to bound the context size
max_tokens = chat_mdl.max_length
+# Collect and deduplicate the tenant IDs of all knowledge bases
tenant_ids = list(set([kb.tenant_id for kb in kbs]))
+# Run the retriever to fetch the relevant chunks
kbinfos = retriever.retrieval(question, embd_mdl, tenant_ids, kb_ids,
1, 12, 0.1, 0.3, aggs=False,
rank_feature=label_question(question, kbs)
)
+# Format the retrieved chunks into prompt text, staying within the model's token limit
knowledges = kb_prompt(kbinfos, max_tokens)
prompt = """
Role: You're a smart assistant. Your name is Miss R.

@@ -504,6 +537,7 @@ def ask(question, kb_ids, tenant_id):
""" % "\n".join(knowledges)
msg = [{"role": "user", "content": question}]

+# After generation completes, add citation markers to the answer
def decorate_answer(answer):
nonlocal knowledges, kbinfos, prompt
answer, idx = retriever.insert_citations(answer,

@@ -534,4 +568,4 @@ def ask(question, kb_ids, tenant_id):
for ans in chat_mdl.chat_streamly(prompt, msg, {"temperature": 0.1}):
answer = ans
yield {"answer": answer, "reference": {}}
yield decorate_answer(answer)
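Since ask() yields partial answers and then a final decorated payload, a caller typically drains the generator and keeps the last item. A minimal usage sketch under those assumptions; the import path, the `run_ask` helper, and the IDs are illustrative, not part of the diff.

```python
# Path assumed from the function shown above; adjust to where ask() actually lives.
from api.db.services.dialog_service import ask

def run_ask(question: str, kb_ids: list[str], tenant_id: str) -> dict:
    """Drain the ask() generator and keep the final payload (the one carrying references)."""
    final = {}
    for payload in ask(question, kb_ids, tenant_id):
        final = payload  # earlier payloads hold partial answers with empty references
    return final

# Hypothetical IDs, for illustration only.
result = run_ask("What's Elon's view on dogecoin?", ["kb_0001"], "tenant_0001")
print(result.get("answer", ""))
```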
@@ -71,13 +71,27 @@ class FulltextQueryer:
return txt

def question(self, txt, tbl="qa", min_match: float = 0.6):
+"""
+Process a user question and build a full-text search expression.
+
+Args:
+    txt: the raw question text
+    tbl: the table to query (default "qa")
+    min_match: minimum match threshold (default 0.6)
+
+Returns:
+    MatchTextExpr: the full-text search expression
+    list: the extracted keywords
+"""
+# 1. Text preprocessing: strip special characters, convert traditional to simplified Chinese, full-width to half-width, lower-case
txt = re.sub(
r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>]+",
" ",
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
).strip()
-txt = FulltextQueryer.rmWWW(txt)
+txt = FulltextQueryer.rmWWW(txt)  # remove stop words

+# 2. Non-Chinese text handling
if not self.isChinese(txt):
txt = FulltextQueryer.rmWWW(txt)
tks = rag_tokenizer.tokenize(txt).split()

@@ -117,30 +131,43 @@ class FulltextQueryer:
), keywords

def need_fine_grained_tokenize(tk):
+"""
+Decide whether a token needs fine-grained tokenization.
+Args:
+    tk: the token to check
+Returns:
+    bool: True if fine-grained tokenization is needed
+"""
if len(tk) < 3:
return False
if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
return False
return True

-txt = FulltextQueryer.rmWWW(txt)
+txt = FulltextQueryer.rmWWW(txt)  # remove stop words a second time
-qs, keywords = [], []
+qs, keywords = [], []  # initialize the query expressions and the keyword list
+# 3. Chinese text handling (process at most 256 terms)
for tt in self.tw.split(txt)[:256]: # .split():
if not tt:
continue
+# 3.1 Collect the base keyword
keywords.append(tt)
-twts = self.tw.weights([tt])
+twts = self.tw.weights([tt])  # term weights
-syns = self.syn.lookup(tt)
+syns = self.syn.lookup(tt)  # synonym lookup
+# 3.2 Synonym expansion (up to 32 keywords in total)
if syns and len(keywords) < 32:
keywords.extend(syns)
logging.debug(json.dumps(twts, ensure_ascii=False))
tms = []
+# 3.3 Process each term together with its weight
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
+# 3.3.1 Fine-grained tokenization
sm = (
rag_tokenizer.fine_grained_tokenize(tk).split()
if need_fine_grained_tokenize(tk)
else []
)
+# 3.3.2 Clean the tokenization result
sm = [
re.sub(
r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",

@@ -151,36 +178,41 @@ class FulltextQueryer:
]
sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
sm = [m for m in sm if len(m) > 1]
+# 3.3.3 Collect keywords (no more than 32)
if len(keywords) < 32:
keywords.append(re.sub(r"[ \\\"']+", "", tk))
keywords.extend(sm)

+# 3.3.4 Synonym handling
tk_syns = self.syn.lookup(tk)
tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
if len(keywords) < 32:
keywords.extend([s for s in tk_syns if s])
tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
tk_syns = [f"\"{s}\"" if s.find(" ") > 0 else s for s in tk_syns]
+# Cap on the number of keywords
if len(keywords) >= 32:
break

+# 3.3.5 Build the query expression
tk = FulltextQueryer.subSpecialChar(tk)
if tk.find(" ") > 0:
-tk = '"%s"' % tk
+tk = '"%s"' % tk  # phrase query
if tk_syns:
-tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns)
+tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns)  # add the synonym query
if sm:
-tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm))
+tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm))  # add the fine-grained token query
if tk.strip():
-tms.append((tk, w))
+tms.append((tk, w))  # keep the weighted query expression

+# 3.4 Merge the query expressions for the current term
tms = " ".join([f"({t})^{w}" for t, w in tms])

+# 3.5 Add an adjacent-term combination query (boosts phrase matches)
if len(twts) > 1:
tms += ' ("%s"~2)^1.5' % rag_tokenizer.tokenize(tt)

+# 3.6 Build the synonym query expression
syns = " OR ".join(
[
'"%s"'

@@ -191,9 +223,10 @@ class FulltextQueryer:
if syns and tms:
tms = f"({tms})^5 OR ({syns})^0.7"

-qs.append(tms)
+qs.append(tms)  # append to the final query list

+# 4. Generate the final query expression
if qs:
query = " OR ".join([f"({t})" for t in qs if t])
return MatchTextExpr(
self.query_fields, query, 100, {"minimum_should_match": min_match}
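To see what the expression-building steps above produce, here is a small standalone sketch that mimics steps 3.3.5 and 3.4 for a single term, using made-up weights, synonyms, and fine-grained tokens rather than the real tokenizer and synonym classes.

```python
def build_term_expr(tk: str, weight: float, syns: list[str], fine: list[str]) -> str:
    """Mimic steps 3.3.5-3.4: phrase, synonym and fine-grained clauses with boosts."""
    expr = f'"{tk}"' if " " in tk else tk              # phrase query if the term contains spaces
    if syns:
        expr = f"({expr} OR ({' '.join(syns)})^0.2)"   # synonyms get a low boost
    if fine:
        fg = " ".join(fine)
        expr = f'{expr} OR "{fg}" OR ("{fg}"~2)^0.5'   # fine-grained tokens plus a proximity clause
    return f"({expr})^{weight}"

# Illustrative values only.
print(build_term_expr("machine learning", 3.2, syns=["ml"], fine=["machine", "learning"]))
# -> (("machine learning" OR (ml)^0.2) OR "machine learning" OR ("machine learning"~2)^0.5)^3.2
```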
@@ -70,14 +70,44 @@ class Dealer:
highlight=False,
rank_feature: dict | None = None
):
+"""
+Run hybrid retrieval (full-text search + vector search).
+
+Args:
+    req: request dict containing:
+        - page: page number
+        - topk: maximum number of results to return
+        - size: page size
+        - fields: fields to return
+        - question: the query text
+        - similarity: vector similarity threshold
+    idx_names: index name or list of index names
+    kb_ids: list of knowledge base IDs
+    emb_mdl: embedding model used for vector search
+    highlight: whether to return highlighted content
+    rank_feature: ranking feature configuration
+
+Returns:
+    SearchResult containing:
+    - total: total number of matches
+    - ids: matched chunk IDs
+    - query_vector: the query vector
+    - field: field values for each chunk
+    - highlight: highlighted content
+    - aggregation: aggregation results
+    - keywords: extracted keywords
+"""
+# 1. Initialize filter conditions and sort rules
filters = self.get_filters(req)
orderBy = OrderByExpr()

+# 2. Handle pagination parameters
pg = int(req.get("page", 1)) - 1
topk = int(req.get("topk", 1024))
ps = int(req.get("size", topk))
offset, limit = pg * ps, ps

+# 3. Set the fields to return (defaults include document name, content and other core fields)
src = req.get("fields",
["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
"doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",

@@ -85,9 +115,11 @@ class Dealer:
"available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD])
kwds = set([])

+# 4. Handle the query text
qst = req.get("question", "")
q_vec = []
if not qst:
+# 4.1 No query text: sort by document order
if req.get("sort"):
orderBy.asc("page_num_int")
orderBy.asc("top_int")

@@ -96,22 +128,29 @@ class Dealer:
total = self.dataStore.getTotal(res)
logging.debug("Dealer.search TOTAL: {}".format(total))
else:
+# 4.2 Query text present
highlightFields = ["content_ltks", "title_tks"] if highlight else []

+# 4.2.1 Build the full-text match expression and extract keywords
matchText, keywords = self.qryr.question(qst, min_match=0.3)
if emb_mdl is None:
+# 4.2.2 Full-text-only mode
matchExprs = [matchText]
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit,
idx_names, kb_ids, rank_feature=rank_feature)
total = self.dataStore.getTotal(res)
logging.debug("Dealer.search TOTAL: {}".format(total))
else:
+# 4.2.3 Hybrid mode (full-text + vector)
+# Build the query vector
matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
q_vec = matchDense.embedding_data
src.append(f"q_{len(q_vec)}_vec")
+# Hybrid retrieval weights (5% full-text + 95% vector)
fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05, 0.95"})
matchExprs = [matchText, matchDense, fusionExpr]

+# Run the hybrid search
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit,
idx_names, kb_ids, rank_feature=rank_feature)
total = self.dataStore.getTotal(res)
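The `weighted_sum` fusion above combines the two score lists with fixed weights (0.05 for full-text, 0.95 for vectors). The real FusionExpr is executed inside the datastore, so the following is only a sketch of the combination it expresses, assuming both score sets are already normalized to [0, 1].

```python
def weighted_sum(text_scores: dict[str, float],
                 vec_scores: dict[str, float],
                 w_text: float = 0.05, w_vec: float = 0.95) -> dict[str, float]:
    """Fuse per-chunk scores from full-text and vector search with fixed weights."""
    fused = {}
    for chunk_id in set(text_scores) | set(vec_scores):
        fused[chunk_id] = (w_text * text_scores.get(chunk_id, 0.0)
                           + w_vec * vec_scores.get(chunk_id, 0.0))
    # Highest fused score first.
    return dict(sorted(fused.items(), key=lambda kv: -kv[1]))

# Illustrative scores only.
print(weighted_sum({"c1": 0.9, "c2": 0.4}, {"c1": 0.2, "c3": 0.8}))
```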
@@ -340,48 +379,86 @@ class Dealer:
vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True,
rerank_mdl=None, highlight=False,
rank_feature: dict | None = {PAGERANK_FLD: 10}):
+"""
+Run retrieval: find the document chunks relevant to the question.
+
+Args:
+    - question: the user's query
+    - embd_mdl: embedding model used to turn text into vectors
+    - tenant_ids: tenant ID(s), as a string or a list
+    - kb_ids: list of knowledge base IDs
+    - page: current page number
+    - page_size: number of results per page
+    - similarity_threshold: results below this similarity are filtered out
+    - vector_similarity_weight: weight of the vector similarity
+    - top: maximum number of results to retrieve
+    - doc_ids: document IDs that restrict the search scope
+    - aggs: whether to aggregate document information
+    - rerank_mdl: reranking model
+    - highlight: whether to highlight matched content
+    - rank_feature: ranking feature, e.g. a PageRank value
+
+Returns:
+    dict with the retrieval results: total count, chunks, and document aggregations
+"""
+# Initialize the result dict
ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
if not question:
return ranks
+# Page limit below which results are reranked
RERANK_PAGE_LIMIT = 3
+# Build the retrieval request
req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": max(page_size * RERANK_PAGE_LIMIT, 128),
"question": question, "vector": True, "topk": top,
"similarity": similarity_threshold,
"available_int": 1}

+# If the page exceeds the rerank limit, request exactly that page
if page > RERANK_PAGE_LIMIT:
req["page"] = page
req["size"] = page_size

+# Normalize the tenant ID format
if isinstance(tenant_ids, str):
tenant_ids = tenant_ids.split(",")

+# Run the search
sres = self.search(req, [index_name(tid) for tid in tenant_ids],
kb_ids, embd_mdl, highlight, rank_feature=rank_feature)
ranks["total"] = sres.total

+# Decide whether reranking is needed based on the page number
if page <= RERANK_PAGE_LIMIT:
+# The first few pages are reranked to improve result quality
if rerank_mdl and sres.total > 0:
+# Rerank with the reranking model
sim, tsim, vsim = self.rerank_by_model(rerank_mdl,
sres, question, 1 - vector_similarity_weight,
vector_similarity_weight,
rank_feature=rank_feature)
else:
+# Rerank with the default method
sim, tsim, vsim = self.rerank(
sres, question, 1 - vector_similarity_weight, vector_similarity_weight,
rank_feature=rank_feature)
+# Sort by similarity in descending order and take the slice for the current page
idx = np.argsort(sim * -1)[(page - 1) * page_size:page * page_size]
else:
+# Later pages are not reranked; use the search results as-is
sim = tsim = vsim = [1] * len(sres.ids)
idx = list(range(len(sres.ids)))

+# Vector dimension and column name
dim = len(sres.query_vector)
vector_column = f"q_{dim}_vec"
zero_vector = [0.0] * dim

+# Process each retrieved result
for i in idx:
+# Filter out results below the similarity threshold
if sim[i] < similarity_threshold:
break
+# Limit the number of returned results
if len(ranks["chunks"]) >= page_size:
if aggs:
continue

@@ -391,6 +468,7 @@ class Dealer:
dnm = chunk.get("docnm_kwd", "")
did = chunk.get("doc_id", "")
position_int = chunk.get("position_int", [])
+# Build the result dict
d = {
"chunk_id": id,
"content_ltks": chunk["content_ltks"],

@@ -406,6 +484,8 @@ class Dealer:
"vector": chunk.get(vector_column, zero_vector),
"positions": position_int,
}

+# Handle highlighted content
if highlight and sres.highlight:
if id in sres.highlight:
d["highlight"] = rmSpace(sres.highlight[id])

@@ -415,6 +495,7 @@ class Dealer:
if dnm not in ranks["doc_aggs"]:
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
ranks["doc_aggs"][dnm]["count"] += 1
+# Convert the document aggregations to a list, sorted by count in descending order
ranks["doc_aggs"] = [{"doc_name": k,
"doc_id": v["doc_id"],
"count": v["count"]} for k,
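The rerank branch above blends a text-match score and a vector score with complementary weights, then slices the requested page out of the descending order. A compact sketch of that scoring and slicing with made-up score arrays; the real rerank and rerank_by_model also fold in rank features such as PageRank, which this ignores.

```python
import numpy as np

def blend_and_page(tsim: np.ndarray, vsim: np.ndarray,
                   vector_weight: float, page: int, page_size: int) -> np.ndarray:
    """Blend text and vector similarities, sort descending, return the indices for one page."""
    sim = (1 - vector_weight) * tsim + vector_weight * vsim
    order = np.argsort(sim * -1)                          # descending order
    return order[(page - 1) * page_size: page * page_size]

# Illustrative similarities for five chunks.
tsim = np.array([0.2, 0.8, 0.5, 0.9, 0.1])
vsim = np.array([0.7, 0.3, 0.6, 0.4, 0.9])
print(blend_and_page(tsim, vsim, vector_weight=0.3, page=1, page_size=3))
```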
@@ -149,6 +149,14 @@ class Dealer:
return res

def split(self, txt):
+"""
+Special-purpose tokenization that mainly merges consecutive English words.
+Args:
+    txt: the text string to tokenize
+Returns:
+    the processed list of tokens
+"""
tks = []
for t in re.sub(r"[ \t]+", " ", txt).split():
if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
@@ -55,7 +55,18 @@ def llm_id2llm_type(llm_id):


def message_fit_in(msg, max_length=4000):
+"""
+Trim the message list so that its total token count stays within max_length.
+
+Args:
+    msg: list of messages, each a dict with "role" and "content"
+    max_length: maximum number of tokens, 4000 by default
+
+Returns:
+    tuple: (actual token count, adjusted message list)
+"""
def count():
+"""Count the total number of tokens in the current message list."""
nonlocal msg
tks_cnts = []
for m in msg:

@@ -67,9 +78,11 @@ def message_fit_in(msg, max_length=4000):
return total

c = count()
+# If within the limit, return as-is
if c < max_length:
return c, msg

+# First pass: keep only the system messages and the last message
msg_ = [m for m in msg if m["role"] == "system"]
if len(msg) > 1:
msg_.append(msg[-1])

@@ -77,15 +90,18 @@ def message_fit_in(msg, max_length=4000):
c = count()
if c < max_length:
return c, msg

+# Token counts of the system message and the last message
ll = num_tokens_from_string(msg_[0]["content"])
ll2 = num_tokens_from_string(msg_[-1]["content"])
+# If the system message takes more than 80% of the budget, truncate the system message
if ll / (ll + ll2) > 0.8:
m = msg_[0]["content"]
m = encoder.decode(encoder.encode(m)[:max_length - ll2])
msg[0]["content"] = m
return max_length, msg

+# Otherwise truncate the last message
m = msg_[-1]["content"]
m = encoder.decode(encoder.encode(m)[:max_length - ll2])
msg[-1]["content"] = m
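The truncation above relies on an encoder that can cut a string at an exact token budget. A standalone sketch of the same idea using tiktoken's cl100k_base encoding; the encoder the project actually binds to `encoder` may differ, so treat this as an assumption.

```python
import tiktoken

encoder = tiktoken.get_encoding("cl100k_base")

def truncate_to_tokens(text: str, budget: int) -> str:
    """Keep only the first `budget` tokens of `text`."""
    tokens = encoder.encode(text)
    if len(tokens) <= budget:
        return text
    return encoder.decode(tokens[:budget])

# e.g. fit a long system prompt into the space left after a 500-token last message
print(truncate_to_tokens("some very long system prompt ...", budget=4000 - 500))
```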
@@ -93,6 +109,23 @@ def message_fit_in(msg, max_length=4000):


def kb_prompt(kbinfos, max_tokens):
+"""
+Format the retrieved knowledge base content into a prompt suitable for a large language model.
+
+Args:
+    kbinfos (dict): retrieval results, containing the chunks and related info
+    max_tokens (int): the model's maximum token limit
+
+Flow:
+    1. Extract the content of all retrieved chunks
+    2. Count tokens to make sure the model limit is not exceeded
+    3. Fetch document metadata
+    4. Group chunks by document name
+    5. Format everything as a structured prompt
+
+Returns:
+    list: the formatted knowledge base content, one element per document
+"""
knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
used_token_count = 0
chunks_num = 0

@@ -126,58 +159,56 @@ def kb_prompt(kbinfos, max_tokens):

def citation_prompt():
return """
-# Citation requirements:
-- Inserts CITATIONS in format '##i$$ ##j$$' where i,j are the ID of the content you are citing and encapsulated with '##' and '$$'.
-- Inserts the CITATION symbols at the end of a sentence, AND NO MORE than 4 citations.
-- DO NOT insert CITATION in the answer if the content is not from retrieved chunks.
+# 引用要求:
+- 以格式 '##i$$ ##j$$'插入引用,其中 i, j 是所引用内容的 ID,并用 '##' 和 '$$' 包裹。
+- 在句子末尾插入引用,每个句子最多 4 个引用。
+- 如果答案内容不来自检索到的文本块,则不要插入引用。

---- Example START ---
-<SYSTEM>: Here is the knowledge base:
+--- 示例 ---
+<SYSTEM>: 以下是知识库:

-Document: Elon Musk Breaks Silence on Crypto, Warns Against Dogecoin ...
+Document: 埃隆·马斯克打破沉默谈加密货币,警告不要全仓狗狗币 ...
URL: https://blockworks.co/news/elon-musk-crypto-dogecoin
ID: 0
-The Tesla co-founder advised against going all-in on dogecoin, but Elon Musk said it’s still his favorite crypto...
+特斯拉联合创始人建议不要全仓投入 Dogecoin,但埃隆·马斯克表示它仍然是他最喜欢的加密货币...

-Document: Elon Musk's Dogecoin tweet sparks social media frenzy
+Document: 埃隆·马斯克关于狗狗币的推文引发社交媒体狂热
ID: 1
-Musk said he is 'willing to serve' D.O.G.E. – shorthand for Dogecoin.
+马斯克表示他“愿意服务”D.O.G.E.——即 Dogecoin 的缩写。

-Document: Causal effect of Elon Musk tweets on Dogecoin price
+Document: 埃隆·马斯克推文对狗狗币价格的因果影响
ID: 2
-If you think of Dogecoin — the cryptocurrency based on a meme — you can’t help but also think of Elon Musk...
+如果你想到 Dogecoin——这个基于表情包的加密货币,你就无法不想到埃隆·马斯克...

-Document: Elon Musk's Tweet Ignites Dogecoin's Future In Public Services
+Document: 埃隆·马斯克推文点燃狗狗币在公共服务领域的未来前景
ID: 3
-The market is heating up after Elon Musk's announcement about Dogecoin. Is this a new era for crypto?...
+在埃隆·马斯克关于 Dogecoin 的公告后,市场正在升温。这是否意味着加密货币的新纪元?...

-The above is the knowledge base.
+以上是知识库。

-<USER>: What's the Elon's view on dogecoin?
+<USER>: 埃隆·马斯克对 Dogecoin 的看法是什么?

-<ASSISTANT>: Musk has consistently expressed his fondness for Dogecoin, often citing its humor and the inclusion of dogs in its branding. He has referred to it as his favorite cryptocurrency ##0$$ ##1$$.
+<ASSISTANT>: 马斯克一贯表达了对 Dogecoin 的喜爱,常常提及其幽默感和品牌中狗的元素。他曾表示这是他最喜欢的加密货币 ##0 ##1。
-Recently, Musk has hinted at potential future roles for Dogecoin. His tweets have sparked speculation about Dogecoin's potential integration into public services ##3$$.
+最近,马斯克暗示 Dogecoin 未来可能会有新的应用场景。他的推文引发了关于 Dogecoin 可能被整合到公共服务中的猜测 ##3$$。
-Overall, while Musk enjoys Dogecoin and often promotes it, he also warns against over-investing in it, reflecting both his personal amusement and caution regarding its speculative nature.
+总体而言,虽然马斯克喜欢 Dogecoin 并经常推广它,但他也警告不要过度投资,反映了他对其投机性质的既喜爱又谨慎的态度。

---- Example END ---
+--- 示例结束 ---

"""


def keyword_extraction(chat_mdl, content, topn=3):
prompt = f"""
-Role: You're a text analyzer.
-Task: extract the most important keywords/phrases of a given piece of text content.
-Requirements:
-- Summarize the text content, and give top {topn} important keywords/phrases.
-- The keywords MUST be in language of the given piece of text content.
-- The keywords are delimited by ENGLISH COMMA.
-- Keywords ONLY in output.
+角色:文本分析器
+任务:提取给定文本内容中最重要的关键词/短语
+要求:
+- 总结文本内容,给出前{topn}个重要关键词/短语
+- 关键词必须使用原文语言
+- 关键词之间用英文逗号分隔
+- 仅输出关键词

-### Text Content
+### 文本内容
{content}

"""
msg = [
{"role": "system", "content": prompt},
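Downstream code has to find the `##i$$` markers that citation_prompt asks the model to emit and map them back to chunk IDs. A small sketch of that extraction, assuming only the marker format shown in the prompt; it is not the project's own parser.

```python
import re

CITATION_RE = re.compile(r"##(\d+)\$\$")

def extract_citations(answer: str) -> tuple[str, list[int]]:
    """Return the answer with markers stripped, plus the cited chunk IDs in order of appearance."""
    ids = [int(m) for m in CITATION_RE.findall(answer)]
    clean = re.sub(r"\s+", " ", CITATION_RE.sub("", answer)).strip()
    return clean, ids

text = "Musk called it his favorite cryptocurrency ##0$$ ##1$$."
print(extract_citations(text))
# -> ('Musk called it his favorite cryptocurrency .', [0, 1])
```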
requirements.txt (351 changed lines)
@@ -1,250 +1,115 @@
academicagent==0.1.2
|
|
||||||
accelerate==1.5.2
|
|
||||||
aiohappyeyeballs==2.5.0
|
|
||||||
aiohttp==3.11.13
|
|
||||||
aiosignal==1.3.2
|
|
||||||
annotated-types==0.7.0
|
|
||||||
anyio==4.8.0
|
|
||||||
async-timeout==4.0.3
|
|
||||||
attrs==25.1.0
|
|
||||||
backoff==2.2.1
|
|
||||||
backports.tarfile==1.2.0
|
|
||||||
backtrader==1.9.78.123
|
|
||||||
beartype==0.20.0
|
|
||||||
beautifulsoup4==4.13.3
|
|
||||||
bs4==0.0.2
|
|
||||||
cachetools==5.5.2
|
|
||||||
cbor==1.0.0
|
|
||||||
certifi==2025.1.31
|
|
||||||
cffi==1.17.1
|
|
||||||
chardet==5.2.0
|
|
||||||
charset-normalizer==3.4.1
|
|
||||||
click==8.1.8
|
|
||||||
cn2an==0.5.23
|
|
||||||
cnki-agent==0.1.2
|
|
||||||
CnkiSpider==1.1.0
|
|
||||||
colorama==0.4.6
|
|
||||||
coloredlogs==15.0.1
|
|
||||||
colpali_engine==0.3.8
|
|
||||||
contourpy==1.3.1
|
|
||||||
cramjam==2.9.1
|
|
||||||
cryptography==44.0.2
|
|
||||||
csscompressor==0.9.5
|
|
||||||
cssselect==1.3.0
|
|
||||||
cssutils==2.11.1
|
|
||||||
ctranslate2==4.5.0
|
|
||||||
cycler==0.12.1
|
|
||||||
dashscope==1.22.2
|
|
||||||
dataclasses-json==0.6.7
|
|
||||||
DataRecorder==3.6.2
|
|
||||||
datasets==3.4.0
|
|
||||||
datrie==0.8.2
|
datrie==0.8.2
|
||||||
dill==0.3.8
|
akshare>=1.15.78,<2.0.0
|
||||||
diskcache==5.6.3
|
azure-storage-blob==12.22.0
|
||||||
distro==1.9.0
|
azure-identity==1.17.1
|
||||||
docutils==0.21.2
|
azure-storage-file-datalake==12.16.0
|
||||||
DownloadKit==2.0.7
|
anthropic==0.34.1
|
||||||
DrissionPage==4.1.0.17
|
arxiv==2.1.3
|
||||||
einops==0.8.1
|
aspose-slides>=24.9.0,<25.0.0; platform_machine == 'x86_64' or (sys_platform == 'darwin' and platform_machine == 'arm64')
|
||||||
elastic-transport==8.17.1
|
beartype>=0.18.5,<0.19.0
|
||||||
elasticsearch==8.17.2
|
bio==1.7.1
|
||||||
elasticsearch-dsl==8.17.1
|
blinker==1.7.0
|
||||||
et_xmlfile==2.0.0
|
boto3==1.34.140
|
||||||
evaluate==0.4.3
|
botocore==1.34.140
|
||||||
exceptiongroup==1.2.2
|
cachetools==5.3.3
|
||||||
fastapi==0.115.11
|
chardet==5.2.0
|
||||||
fastparquet==2024.11.0
|
cn2an==0.5.22
|
||||||
filelock==3.17.0
|
cohere==5.6.2
|
||||||
FlagEmbedding==1.3.4
|
Crawl4AI==0.3.8
|
||||||
flatbuffers==25.2.10
|
dashscope==1.20.11
|
||||||
fonttools==4.56.0
|
deepl==1.18.0
|
||||||
frozenlist==1.5.0
|
demjson3==3.0.6
|
||||||
fsspec==2024.12.0
|
discord-py==2.3.2
|
||||||
google-ai-generativelanguage==0.6.15
|
duckduckgo-search>=7.2.0,<8.0.0
|
||||||
google-api-core==2.24.2
|
editdistance==0.8.1
|
||||||
google-api-python-client==2.164.0
|
elastic-transport==8.12.0
|
||||||
google-auth==2.38.0
|
elasticsearch==8.12.1
|
||||||
google-auth-httplib2==0.2.0
|
elasticsearch-dsl==8.12.0
|
||||||
google-generativeai==0.8.4
|
filelock==3.15.4
|
||||||
googleapis-common-protos==1.69.1
|
flask==3.0.3
|
||||||
GPUtil==1.4.0
|
flask-cors==5.0.0
|
||||||
greenlet==3.1.1
|
flask-login==0.6.3
|
||||||
grpcio==1.71.0
|
flask-session==0.8.0
|
||||||
grpcio-status==1.71.0
|
google-search-results==2.4.2
|
||||||
h11==0.14.0
|
groq==0.9.0
|
||||||
hanziconv==0.3.2
|
hanziconv==0.3.2
|
||||||
hf_transfer==0.1.9
|
html-text==0.6.2
|
||||||
html-minifier==0.0.4
|
httpx==0.27.0
|
||||||
httpcore==1.0.7
|
huggingface-hub>=0.25.0,<0.26.0
|
||||||
httplib2==0.22.0
|
infinity-sdk==0.6.0-dev3
|
||||||
httptools==0.6.4
|
infinity-emb>=0.0.66,<0.0.67
|
||||||
httpx==0.28.1
|
itsdangerous==2.1.2
|
||||||
httpx-sse==0.4.0
|
json-repair==0.35.0
|
||||||
huggingface-hub==0.29.3
|
markdown==3.6
|
||||||
humanfriendly==10.0
|
markdown-to-json==2.1.1
|
||||||
id==1.5.0
|
minio==7.2.4
|
||||||
idna==3.10
|
mistralai==0.4.2
|
||||||
ijson==3.3.0
|
|
||||||
importlib_metadata==8.6.1
|
|
||||||
infinity-sdk==0.6.0.dev3
|
|
||||||
infinity_emb==0.0.75
|
|
||||||
iniconfig==2.0.0
|
|
||||||
inscriptis==2.5.3
|
|
||||||
ir_datasets==0.5.10
|
|
||||||
jaraco.classes==3.4.0
|
|
||||||
jaraco.context==6.0.1
|
|
||||||
jaraco.functools==4.1.0
|
|
||||||
Jinja2==3.1.6
|
|
||||||
jiter==0.9.0
|
|
||||||
joblib==1.4.2
|
|
||||||
jsmin==3.0.1
|
|
||||||
json_repair==0.39.1
|
|
||||||
jsonpatch==1.33
|
|
||||||
jsonpointer==3.0.0
|
|
||||||
keyring==25.6.0
|
|
||||||
kiwisolver==1.4.8
|
|
||||||
langchain==0.3.20
|
|
||||||
langchain-community==0.3.19
|
|
||||||
langchain-core==0.3.41
|
|
||||||
langchain-ollama==0.2.3
|
|
||||||
langchain-text-splitters==0.3.6
|
|
||||||
langsmith==0.3.12
|
|
||||||
lxml==5.3.1
|
|
||||||
lz4==4.4.3
|
|
||||||
markdown-it-py==3.0.0
|
|
||||||
MarkupSafe==3.0.2
|
|
||||||
marshmallow==3.26.1
|
|
||||||
matplotlib==3.10.0
|
|
||||||
mdurl==0.1.2
|
|
||||||
monotonic==1.6
|
|
||||||
more-itertools==10.6.0
|
|
||||||
mpmath==1.3.0
|
|
||||||
multidict==6.1.0
|
|
||||||
multiprocess==0.70.16
|
|
||||||
mypy-extensions==1.0.0
|
|
||||||
mysql==0.0.3
|
|
||||||
mysql-connector-python==9.2.0
|
|
||||||
mysqlclient==2.2.7
|
|
||||||
networkx==3.4.2
|
|
||||||
nh3==0.2.21
|
|
||||||
nltk==3.9.1
|
nltk==3.9.1
|
||||||
numpy==1.26.4
|
numpy>=1.26.0,<2.0.0
|
||||||
ollama==0.4.7
|
ollama==0.2.1
|
||||||
onnx==1.17.0
|
onnxruntime==1.19.2; sys_platform == 'darwin' or platform_machine != 'x86_64'
|
||||||
onnxruntime==1.21.0
|
onnxruntime-gpu==1.19.2; sys_platform != 'darwin' and platform_machine == 'x86_64'
|
||||||
openai==1.66.3
|
openai==1.45.0
|
||||||
openpyxl==3.1.5
|
opencv-python==4.10.0.84
|
||||||
optimum==1.24.0
|
opencv-python-headless==4.10.0.84
|
||||||
orjson==3.10.15
|
openpyxl>=3.1.0,<4.0.0
|
||||||
ormsgpack==1.8.0
|
ormsgpack==1.5.0
|
||||||
outcome==1.3.0.post0
|
pandas>=2.2.0,<3.0.0
|
||||||
packaging==24.2
|
pdfplumber==0.10.4
|
||||||
pandas==2.2.3
|
peewee==3.17.1
|
||||||
pdfminer.six==20231228
|
pillow==10.4.0
|
||||||
pdfplumber==0.11.5
|
protobuf==5.27.2
|
||||||
peft==0.14.0
|
psycopg2-binary==2.9.9
|
||||||
pillow==11.1.0
|
pyclipper==1.3.0.post5
|
||||||
pluggy==1.5.0
|
|
||||||
polars-lts-cpu==1.9.0
|
|
||||||
posthog==3.20.0
|
|
||||||
proces==0.1.7
|
|
||||||
prometheus-fastapi-instrumentator==7.0.2
|
|
||||||
prometheus_client==0.21.1
|
|
||||||
propcache==0.3.0
|
|
||||||
proto-plus==1.26.1
|
|
||||||
protobuf==5.29.3
|
|
||||||
psutil==7.0.0
|
|
||||||
pyarrow==17.0.0
|
|
||||||
pyasn1==0.6.1
|
|
||||||
pyasn1_modules==0.4.1
|
|
||||||
pycparser==2.22
|
|
||||||
pycryptodome==3.21.0
|
|
||||||
pycryptodomex==3.20.0
|
pycryptodomex==3.20.0
|
||||||
pydantic==2.9.2
|
pypdf>=5.0.0,<6.0.0
|
||||||
pydantic-settings==2.8.1
|
pytest>=8.3.0,<9.0.0
|
||||||
pydantic_core==2.23.4
|
|
||||||
Pygments==2.19.1
|
|
||||||
PyJWT==2.8.0
|
|
||||||
PyMuPDF==1.25.3
|
|
||||||
PyMySQL==1.1.1
|
|
||||||
pyparsing==3.2.1
|
|
||||||
pypdfium2==4.30.1
|
|
||||||
pyreadline3==3.5.4
|
|
||||||
PySocks==1.7.1
|
|
||||||
pytest==8.3.5
|
|
||||||
python-dateutil==2.9.0.post0
|
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
pytz==2025.1
|
python-dateutil==2.8.2
|
||||||
pywin32-ctypes==0.2.3
|
python-pptx>=1.0.2,<2.0.0
|
||||||
PyYAML==6.0.2
|
pywencai==0.12.2
|
||||||
readerwriterlock==1.0.9
|
qianfan==0.4.6
|
||||||
readme_renderer==44.0
|
ranx==0.3.20
|
||||||
regex==2024.11.6
|
readability-lxml==0.8.1
|
||||||
requests==2.32.3
|
valkey==6.0.2
|
||||||
requests-file==2.1.0
|
requests==2.32.2
|
||||||
requests-toolbelt==1.0.0
|
replicate==0.31.0
|
||||||
rfc3986==2.0.0
|
|
||||||
rich==13.9.4
|
|
||||||
roman-numbers==1.0.2
|
roman-numbers==1.0.2
|
||||||
rsa==4.9
|
ruamel-base==1.0.0
|
||||||
ruamel.yaml==0.18.10
|
scholarly==1.7.11
|
||||||
ruamel.yaml.clib==0.2.12
|
scikit-learn==1.5.0
|
||||||
safetensors==0.5.3
|
selenium==4.22.0
|
||||||
scikit-learn==1.6.1
|
selenium-wire==5.1.0
|
||||||
scipy==1.15.2
|
setuptools>=75.2.0,<76.0.0
|
||||||
selenium==4.29.0
|
shapely==2.0.5
|
||||||
sentence-transformers==3.4.1
|
six==1.16.0
|
||||||
sentencepiece==0.2.0
|
strenum==0.4.15
|
||||||
shellingham==1.5.4
|
tabulate==0.9.0
|
||||||
simplejson==3.20.1
|
tavily-python==0.5.1
|
||||||
six==1.17.0
|
tencentcloud-sdk-python==3.0.1215
|
||||||
sniffio==1.3.1
|
tika==2.6.0
|
||||||
sortedcontainers==2.4.0
|
tiktoken==0.7.0
|
||||||
soundfile==0.12.1
|
umap_learn==0.5.6
|
||||||
soupsieve==2.6
|
vertexai==1.64.0
|
||||||
SQLAlchemy==2.0.38
|
volcengine==1.0.146
|
||||||
sqlglot==11.7.1
|
voyageai==0.2.3
|
||||||
starlette==0.46.1
|
webdriver-manager==4.0.1
|
||||||
StrEnum==0.4.15
|
werkzeug==3.0.6
|
||||||
sympy==1.13.1
|
wikipedia==1.4.0
|
||||||
tenacity==9.0.0
|
|
||||||
threadpoolctl==3.6.0
|
|
||||||
thrift==0.20.0
|
|
||||||
tiktoken==0.9.0
|
|
||||||
timm==1.0.15
|
|
||||||
tldextract==5.1.3
|
|
||||||
tokenizers==0.21.1
|
|
||||||
tomli==2.2.1
|
|
||||||
torch==2.6.0
|
|
||||||
torchvision==0.21.0
|
|
||||||
tqdm==4.67.1
|
|
||||||
transformers==4.47.1
|
|
||||||
trec-car-tools==2.6
|
|
||||||
trio==0.29.0
|
|
||||||
trio-websocket==0.12.2
|
|
||||||
tushare==1.4.18
|
|
||||||
twine==6.1.0
|
|
||||||
typer==0.12.5
|
|
||||||
typing-inspect==0.9.0
|
|
||||||
typing_extensions==4.12.2
|
|
||||||
tzdata==2025.1
|
|
||||||
unlzw3==0.2.3
|
|
||||||
uritemplate==4.1.1
|
|
||||||
urllib3==2.3.0
|
|
||||||
uvicorn==0.32.1
|
|
||||||
valkey==6.1.0
|
|
||||||
warc3-wet==0.2.5
|
|
||||||
warc3-wet-clueweb09==0.2.5
|
|
||||||
watchfiles==1.0.4
|
|
||||||
webdriver-manager==4.0.2
|
|
||||||
websocket-client==1.8.0
|
|
||||||
websockets==15.0.1
|
|
||||||
Werkzeug==3.1.3
|
|
||||||
word2number==1.1
|
word2number==1.1
|
||||||
wsproto==1.2.0
|
xgboost==1.5.0
|
||||||
xxhash==3.5.0
|
xpinyin==0.7.6
|
||||||
yarl==1.18.3
|
yfinance==0.1.96
|
||||||
zhipuai==2.1.5.20250106
|
zhipuai==2.0.1
|
||||||
zipp==3.21.0
|
ruamel-yaml>=0.18.6,<0.19.0
|
||||||
zlib-state==0.1.9
|
google-generativeai>=0.8.1,<0.9.0
|
||||||
zstandard==0.23.0
|
python-docx>=1.1.2,<2.0.0
|
||||||
|
pypdf2>=3.0.1,<4.0.0
|
||||||
|
graspologic>=3.4.1,<4.0.0
|
||||||
|
pymysql>=1.1.1,<2.0.0
|
||||||
|
mini-racer>=0.12.4,<0.13.0
|
||||||
|
pyodbc>=5.2.0,<6.0.0
|
||||||
|
pyicu>=2.13.1,<3.0.0
|
||||||
|
flasgger>=0.9.7.1,<0.10.0
|
||||||
|
xxhash>=3.5.0,<4.0.0
|
||||||
|
trio>=0.29.0
|
@@ -95,7 +95,7 @@ const AssistantSetting = ({
<Form.Item
name={'language'}
label={t('language')}
-initialValue={'English'}
+initialValue={'Chinese'}
tooltip="coming soon"
style={{ display: 'none' }}
>