From a51b3168a099c1a09e29483157e07cb697f417d0 Mon Sep 17 00:00:00 2001
From: zstar <65890619+zstar1003@users.noreply.github.com>
Date: Thu, 3 Apr 2025 21:00:49 +0800
Subject: [PATCH] Merge pull request #11 from zstar1003/dev
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
refactor: add some explanatory comments to aid understanding
---
api/db/services/dialog_service.py | 42 ++-
rag/nlp/query.py | 65 +++-
rag/nlp/search.py | 95 ++++-
rag/nlp/term_weight.py | 8 +
rag/prompts.py | 97 +++--
requirements.txt | 351 ++++++------------
.../assistant-setting.tsx | 2 +-
7 files changed, 356 insertions(+), 304 deletions(-)
diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py
index e7d7118..d8500a1 100644
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@@ -239,6 +239,7 @@ def chat(dialog, messages, stream=True, **kwargs):
prompt4citation = ""
if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
prompt4citation = citation_prompt()
+ # Filter out "system"-role messages (the system message was already handled separately above)
msg.extend([{"role": m["role"], "content": re.sub(r"##\d+\$\$", "", m["content"])}
for m in messages if m["role"] != "system"])
used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.95))
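As a minimal standalone sketch of what the re.sub above does (the history contents here are invented), citation markers of the form ##<id>$$ are stripped from earlier turns before they are sent back to the model:

import re

history = [{"role": "user", "content": "What is RAG?"},
           {"role": "assistant", "content": "RAG combines retrieval and generation ##0$$ ##2$$."}]
# Drop system turns and remove "##<id>$$" citation markers from earlier answers.
cleaned = [{"role": m["role"], "content": re.sub(r"##\d+\$\$", "", m["content"])}
           for m in history if m["role"] != "system"]
print(cleaned[1]["content"])  # -> "RAG combines retrieval and generation  ."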
@@ -309,16 +310,20 @@ def chat(dialog, messages, stream=True, **kwargs):
return {"answer": think+answer, "reference": refs, "prompt": re.sub(r"\n", " \n", prompt), "created_at": time.time()}
if stream:
- last_ans = ""
- answer = ""
+ last_ans = "" # the full answer returned at the previous yield
+ answer = "" # the full answer accumulated so far
for ans in chat_mdl.chat_streamly(prompt+prompt4citation, msg[1:], gen_conf):
+ # If there is a reasoning trace (thought), strip the related markers
if thought:
ans = re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
answer = ans
+ # Compute the newly added text fragment (delta)
delta_ans = ans[len(last_ans):]
+ # Skip this yield if the delta is too small (fewer than 16 tokens), to avoid streaming tiny fragments
if num_tokens_from_string(delta_ans) < 16:
continue
last_ans = answer
+ # Yield the accumulated answer (including the reasoning trace) plus the new fragment
yield {"answer": thought+answer, "reference": {}, "audio_binary": tts(tts_mdl, delta_ans)}
delta_ans = answer[len(last_ans):]
if delta_ans:
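The streaming loop above only emits a chunk once roughly 16 new tokens have accumulated. A self-contained sketch of that gating idea, using character counts as a stand-in for the real token counter:

def stream_deltas(cumulative_answers, min_units=16, count=len):
    # `cumulative_answers` yields the full answer so far; only sizable deltas are emitted.
    last, answer = "", ""
    for answer in cumulative_answers:
        delta = answer[len(last):]
        if count(delta) < min_units:
            continue  # delta too small, wait for more output
        last = answer
        yield delta
    tail = answer[len(last):]
    if tail:
        yield tail  # flush whatever is left when the stream ends

print(list(stream_deltas(["Hel", "Hello wor", "Hello world, how are you today?"], min_units=8)))
# -> ['Hello wor', 'ld, how are you today?']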
@@ -471,20 +476,48 @@ def tts(tts_mdl, text):
def ask(question, kb_ids, tenant_id):
+ """
+ 处理用户搜索请求,从知识库中检索相关信息并生成回答
+
+ 参数:
+ question (str): 用户的问题或查询
+ kb_ids (list): 知识库ID列表,指定要搜索的知识库
+ tenant_id (str): 租户ID,用于权限控制和资源隔离
+
+ 流程:
+ 1. 获取指定知识库的信息
+ 2. 确定使用的嵌入模型
+ 3. 根据知识库类型选择检索器(普通检索器或知识图谱检索器)
+ 4. 初始化嵌入模型和聊天模型
+ 5. 执行检索操作获取相关文档片段
+ 6. 格式化知识库内容作为上下文
+ 7. 构建系统提示词
+ 8. 生成回答并添加引用标记
+ 9. 流式返回生成的回答
+
+ 返回:
+ generator: 生成器对象,产生包含回答和引用信息的字典
+ """
+
kbs = KnowledgebaseService.get_by_ids(kb_ids)
embedding_list = list(set([kb.embd_id for kb in kbs]))
is_knowledge_graph = all([kb.parser_id == ParserType.KG for kb in kbs])
retriever = settings.retrievaler if not is_knowledge_graph else settings.kg_retrievaler
-
+ # Initialize the embedding model, used to convert text into vector representations
embd_mdl = LLMBundle(tenant_id, LLMType.EMBEDDING, embedding_list[0])
+ # Initialize the chat model, used to generate the answer
chat_mdl = LLMBundle(tenant_id, LLMType.CHAT)
+ # Maximum token length of the chat model, used to bound the context size
max_tokens = chat_mdl.max_length
+ # Collect and deduplicate the tenant IDs of all knowledge bases
tenant_ids = list(set([kb.tenant_id for kb in kbs]))
+ # Call the retriever to fetch relevant document chunks
kbinfos = retriever.retrieval(question, embd_mdl, tenant_ids, kb_ids,
1, 12, 0.1, 0.3, aggs=False,
rank_feature=label_question(question, kbs)
)
+ # Format the retrieval results into prompt text, keeping within the model's max token limit
knowledges = kb_prompt(kbinfos, max_tokens)
prompt = """
Role: You're a smart assistant. Your name is Miss R.
@@ -504,6 +537,7 @@ def ask(question, kb_ids, tenant_id):
""" % "\n".join(knowledges)
msg = [{"role": "user", "content": question}]
+ # Once generation finishes, add citation markers to the answer
def decorate_answer(answer):
nonlocal knowledges, kbinfos, prompt
answer, idx = retriever.insert_citations(answer,
@@ -534,4 +568,4 @@ def ask(question, kb_ids, tenant_id):
for ans in chat_mdl.chat_streamly(prompt, msg, {"temperature": 0.1}):
answer = ans
yield {"answer": answer, "reference": {}}
- yield decorate_answer(answer)
+ yield decorate_answer(answer)
\ No newline at end of file
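ask() picks one embedding model from the selected knowledge bases and only switches to the knowledge-graph retriever when every KB uses the KG parser. A toy sketch of that selection logic, with made-up KB records (the retriever labels simply echo settings.retrievaler / settings.kg_retrievaler):

from dataclasses import dataclass

@dataclass
class KB:
    embd_id: str
    parser_id: str

def pick_retriever(kbs, kg_parser="knowledge_graph"):
    embedding_list = list({kb.embd_id for kb in kbs})        # deduplicated embedding models
    is_kg = all(kb.parser_id == kg_parser for kb in kbs)     # KG retriever only if *all* KBs are KG
    return ("kg_retrievaler" if is_kg else "retrievaler"), embedding_list[0]

print(pick_retriever([KB("bge-m3", "naive"), KB("bge-m3", "knowledge_graph")]))
# -> ('retrievaler', 'bge-m3')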
diff --git a/rag/nlp/query.py b/rag/nlp/query.py
index b58efb8..8e1aca3 100644
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -71,13 +71,27 @@ class FulltextQueryer:
return txt
def question(self, txt, tbl="qa", min_match: float = 0.6):
+ """
+ 处理用户问题并生成全文检索表达式
+
+ 参数:
+ txt: 原始问题文本
+ tbl: 查询表名(默认"qa")
+ min_match: 最小匹配阈值(默认0.6)
+
+ 返回:
+ MatchTextExpr: 全文检索表达式对象
+ list: 提取的关键词列表
+ """
+ # 1. 文本预处理:去除特殊字符、繁体转简体、全角转半角、转小写
txt = re.sub(
r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>]+",
" ",
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
).strip()
- txt = FulltextQueryer.rmWWW(txt)
+ txt = FulltextQueryer.rmWWW(txt) # remove stop words
+ # 2. Handle non-Chinese text
if not self.isChinese(txt):
txt = FulltextQueryer.rmWWW(txt)
tks = rag_tokenizer.tokenize(txt).split()
@@ -117,30 +131,43 @@ class FulltextQueryer:
), keywords
def need_fine_grained_tokenize(tk):
+ """
+ 判断是否需要细粒度分词
+ 参数:
+ tk: 待判断的词条
+ 返回:
+ bool: True表示需要细粒度分词
+ """
if len(tk) < 3:
return False
if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
return False
return True
- txt = FulltextQueryer.rmWWW(txt)
- qs, keywords = [], []
+ txt = FulltextQueryer.rmWWW(txt) # remove stop words a second time
+ qs, keywords = [], [] # initialize the query-expression list and keyword list
+ # 3. Handle Chinese text (process at most 256 terms)
for tt in self.tw.split(txt)[:256]: # .split():
if not tt:
continue
+ # 3.1 Collect the base keyword
keywords.append(tt)
- twts = self.tw.weights([tt])
- syns = self.syn.lookup(tt)
+ twts = self.tw.weights([tt]) # get term weights
+ syns = self.syn.lookup(tt) # look up synonyms
+ # 3.2 Synonym expansion (up to 32 keywords in total)
if syns and len(keywords) < 32:
keywords.extend(syns)
logging.debug(json.dumps(twts, ensure_ascii=False))
tms = []
+ # 3.3 Process each term together with its weight
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
+ # 3.3.1 Fine-grained tokenization
sm = (
rag_tokenizer.fine_grained_tokenize(tk).split()
if need_fine_grained_tokenize(tk)
else []
)
+ # 3.3.2 Clean up the tokenization results
sm = [
re.sub(
r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
@@ -151,36 +178,41 @@ class FulltextQueryer:
]
sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
sm = [m for m in sm if len(m) > 1]
-
+ # 3.3.3 Collect keywords (no more than 32)
if len(keywords) < 32:
keywords.append(re.sub(r"[ \\\"']+", "", tk))
keywords.extend(sm)
-
+
+ # 3.3.4 Synonym handling
tk_syns = self.syn.lookup(tk)
tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
if len(keywords) < 32:
keywords.extend([s for s in tk_syns if s])
tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
tk_syns = [f"\"{s}\"" if s.find(" ") > 0 else s for s in tk_syns]
-
+ # Cap the number of keywords
if len(keywords) >= 32:
break
-
+
+ # 3.3.5 Build the query expression
tk = FulltextQueryer.subSpecialChar(tk)
if tk.find(" ") > 0:
- tk = '"%s"' % tk
+ tk = '"%s"' % tk # 处理短语查询
if tk_syns:
- tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns)
+ tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns) # 添加同义词查询
if sm:
- tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm))
+ tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm)) # add the fine-grained token query
if tk.strip():
- tms.append((tk, w))
-
+ tms.append((tk, w)) # keep the weighted query expression
+
+ # 3.4 Merge the query expressions for the current term
tms = " ".join([f"({t})^{w}" for t, w in tms])
+ # 3.5 Add an adjacent-term phrase query (boosts phrase matches)
if len(twts) > 1:
tms += ' ("%s"~2)^1.5' % rag_tokenizer.tokenize(tt)
+ # 3.6 Build the synonym query expression
syns = " OR ".join(
[
'"%s"'
@@ -191,9 +223,10 @@ class FulltextQueryer:
if syns and tms:
tms = f"({tms})^5 OR ({syns})^0.7"
- qs.append(tms)
+ qs.append(tms) # append to the final query list
- if qs:
+ # 4. Generate the final query expression
+ if qs:
query = " OR ".join([f"({t})" for t in qs if t])
return MatchTextExpr(
self.query_fields, query, 100, {"minimum_should_match": min_match}
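A rough, self-contained sketch of the kind of weighted expression question() assembles: phrases are quoted, synonyms are OR-ed in with a small boost, and each term carries its weight. The weights and the synonym table below are invented:

def build_query(terms_with_weights, synonyms=None):
    synonyms = synonyms or {}
    parts = []
    for tk, w in sorted(terms_with_weights, key=lambda x: -x[1]):
        expr = f'"{tk}"' if " " in tk else tk              # quote multi-word phrases
        syns = synonyms.get(tk, [])
        if syns:
            expr = f'({expr} OR ({" ".join(syns)})^0.2)'   # synonyms with a low boost
        parts.append(f"({expr})^{w}")
    return " ".join(parts)

print(build_query([("dogecoin", 3.0), ("price history", 1.5)], {"dogecoin": ["doge"]}))
# -> ((dogecoin OR (doge)^0.2))^3.0 ("price history")^1.5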
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index 86416cd..a98c965 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -70,14 +70,44 @@ class Dealer:
highlight=False,
rank_feature: dict | None = None
):
+ """
+ 执行混合检索(全文检索+向量检索)
+
+ 参数:
+ req: 请求参数字典,包含:
+ - page: 页码
+ - topk: 返回结果最大数量
+ - size: 每页大小
+ - fields: 指定返回字段
+ - question: 查询问题文本
+ - similarity: 向量相似度阈值
+ idx_names: 索引名称或列表
+ kb_ids: 知识库ID列表
+ emb_mdl: 嵌入模型,用于向量检索
+ highlight: 是否返回高亮内容
+ rank_feature: 排序特征配置
+
+ 返回:
+ SearchResult对象,包含:
+ - total: 匹配总数
+ - ids: 匹配的chunk ID列表
+ - query_vector: 查询向量
+ - field: 各chunk的字段值
+ - highlight: 高亮内容
+ - aggregation: 聚合结果
+ - keywords: 提取的关键词
+ """
+ # 1. 初始化过滤条件和排序规则
filters = self.get_filters(req)
orderBy = OrderByExpr()
+ # 2. Handle pagination parameters
pg = int(req.get("page", 1)) - 1
topk = int(req.get("topk", 1024))
ps = int(req.get("size", topk))
offset, limit = pg * ps, ps
-
+
+ # 3. Set the fields to return (defaults include core fields such as document name and content)
src = req.get("fields",
["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
"doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
@@ -85,9 +115,11 @@ class Dealer:
"available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD])
kwds = set([])
+ # 4. Handle the query question
qst = req.get("question", "")
q_vec = []
if not qst:
+ # 4.1 No query text: fall back to document-order sorting
if req.get("sort"):
orderBy.asc("page_num_int")
orderBy.asc("top_int")
@@ -96,22 +128,29 @@ class Dealer:
total = self.dataStore.getTotal(res)
logging.debug("Dealer.search TOTAL: {}".format(total))
else:
+ # 4.2 Query text is present
highlightFields = ["content_ltks", "title_tks"] if highlight else []
+
+ # 4.2.1 Build the full-text match expression and extract keywords
matchText, keywords = self.qryr.question(qst, min_match=0.3)
if emb_mdl is None:
+ # 4.2.2 Full-text-only retrieval mode
matchExprs = [matchText]
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit,
idx_names, kb_ids, rank_feature=rank_feature)
total = self.dataStore.getTotal(res)
logging.debug("Dealer.search TOTAL: {}".format(total))
else:
+ # 4.2.3 Hybrid retrieval mode (full-text + vector)
+ # Build the query vector
matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
q_vec = matchDense.embedding_data
src.append(f"q_{len(q_vec)}_vec")
-
+ # Set the hybrid retrieval weights (5% full-text + 95% vector)
fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05, 0.95"})
matchExprs = [matchText, matchDense, fusionExpr]
+ # Run the hybrid search
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit,
idx_names, kb_ids, rank_feature=rank_feature)
total = self.dataStore.getTotal(res)
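The FusionExpr above asks the datastore for a weighted sum of the two match scores (5% full-text, 95% vector). Ignoring score normalization, the weighting itself amounts to something like this toy combination of per-chunk scores:

def weighted_sum(text_scores, vector_scores, w_text=0.05, w_vec=0.95):
    chunks = set(text_scores) | set(vector_scores)
    return {c: w_text * text_scores.get(c, 0.0) + w_vec * vector_scores.get(c, 0.0)
            for c in chunks}

fused = weighted_sum({"chunk_a": 0.80, "chunk_b": 0.30}, {"chunk_a": 0.71, "chunk_b": 0.94})
print(sorted(fused.items(), key=lambda kv: -kv[1]))  # chunk_b wins on vector similarity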
@@ -340,48 +379,86 @@ class Dealer:
vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True,
rerank_mdl=None, highlight=False,
rank_feature: dict | None = {PAGERANK_FLD: 10}):
+ """
+ 执行检索操作,根据问题查询相关文档片段
+
+ 参数说明:
+ - question: 用户输入的查询问题
+ - embd_mdl: 嵌入模型,用于将文本转换为向量
+ - tenant_ids: 租户ID,可以是字符串或列表
+ - kb_ids: 知识库ID列表
+ - page: 当前页码
+ - page_size: 每页结果数量
+ - similarity_threshold: 相似度阈值,低于此值的结果将被过滤
+ - vector_similarity_weight: 向量相似度权重
+ - top: 检索的最大结果数
+ - doc_ids: 文档ID列表,用于限制检索范围
+ - aggs: 是否聚合文档信息
+ - rerank_mdl: 重排序模型
+ - highlight: 是否高亮匹配内容
+ - rank_feature: 排序特征,如PageRank值
+
+ 返回:
+ 包含检索结果的字典,包括总数、文档片段和文档聚合信息
+ """
+ # 初始化结果字典
ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
if not question:
return ranks
-
+ # Page limit below which results are reranked
RERANK_PAGE_LIMIT = 3
+ # Build the retrieval request parameters
req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": max(page_size * RERANK_PAGE_LIMIT, 128),
"question": question, "vector": True, "topk": top,
"similarity": similarity_threshold,
"available_int": 1}
-
+
+ # If the page number exceeds the rerank limit, request that page directly
if page > RERANK_PAGE_LIMIT:
req["page"] = page
req["size"] = page_size
+ # Normalize the tenant ID format
if isinstance(tenant_ids, str):
tenant_ids = tenant_ids.split(",")
-
+
+ # Run the search
sres = self.search(req, [index_name(tid) for tid in tenant_ids],
kb_ids, embd_mdl, highlight, rank_feature=rank_feature)
ranks["total"] = sres.total
-
+
+ # Decide from the page number whether reranking is needed
if page <= RERANK_PAGE_LIMIT:
+ # The first pages are reranked to improve result quality
if rerank_mdl and sres.total > 0:
+ # Rerank with the rerank model
sim, tsim, vsim = self.rerank_by_model(rerank_mdl,
sres, question, 1 - vector_similarity_weight,
vector_similarity_weight,
rank_feature=rank_feature)
else:
+ # Rerank with the default method
sim, tsim, vsim = self.rerank(
sres, question, 1 - vector_similarity_weight, vector_similarity_weight,
rank_feature=rank_feature)
+ # Sort by similarity in descending order and pick the current page
idx = np.argsort(sim * -1)[(page - 1) * page_size:page * page_size]
else:
+ # Later pages are not reranked; use the search results directly
sim = tsim = vsim = [1] * len(sres.ids)
idx = list(range(len(sres.ids)))
-
+
+ # Get the vector dimension and column name
dim = len(sres.query_vector)
vector_column = f"q_{dim}_vec"
zero_vector = [0.0] * dim
+
+ # Process each retrieved chunk
for i in idx:
+ # Drop results below the similarity threshold
if sim[i] < similarity_threshold:
break
+ # Cap the number of returned chunks
if len(ranks["chunks"]) >= page_size:
if aggs:
continue
@@ -391,6 +468,7 @@ class Dealer:
dnm = chunk.get("docnm_kwd", "")
did = chunk.get("doc_id", "")
position_int = chunk.get("position_int", [])
+ # Build the chunk entry
d = {
"chunk_id": id,
"content_ltks": chunk["content_ltks"],
@@ -406,6 +484,8 @@ class Dealer:
"vector": chunk.get(vector_column, zero_vector),
"positions": position_int,
}
+
+ # Handle highlighted content
if highlight and sres.highlight:
if id in sres.highlight:
d["highlight"] = rmSpace(sres.highlight[id])
@@ -415,6 +495,7 @@ class Dealer:
if dnm not in ranks["doc_aggs"]:
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
ranks["doc_aggs"][dnm]["count"] += 1
+ # Convert the document aggregation info into a list sorted by count in descending order
ranks["doc_aggs"] = [{"doc_name": k,
"doc_id": v["doc_id"],
"count": v["count"]} for k,
diff --git a/rag/nlp/term_weight.py b/rag/nlp/term_weight.py
index 6ab49a2..97c3d99 100644
--- a/rag/nlp/term_weight.py
+++ b/rag/nlp/term_weight.py
@@ -149,6 +149,14 @@ class Dealer:
return res
def split(self, txt):
+ """
+ 特殊分词方法,主要处理连续英文单词的合并
+ 参数:
+ txt: 待分词的文本字符串
+
+ 返回:
+ 处理后的词条列表
+ """
tks = []
for t in re.sub(r"[ \t]+", " ", txt).split():
if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
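A standalone approximation of the merging behaviour this docstring describes: consecutive tokens that both end in ASCII letters are joined into a single term (the real method applies extra checks that are omitted here):

import re

def split_merge_english(txt):
    tks = []
    for t in re.sub(r"[ \t]+", " ", txt).split():
        if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and re.match(r".*[a-zA-Z]$", t):
            tks[-1] = tks[-1] + " " + t        # merge consecutive English words
        else:
            tks.append(t)
    return tks

print(split_merge_english("使用 vector database 检索"))  # -> ['使用', 'vector database', '检索']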
diff --git a/rag/prompts.py b/rag/prompts.py
index af6df16..87397b9 100644
--- a/rag/prompts.py
+++ b/rag/prompts.py
@@ -55,7 +55,18 @@ def llm_id2llm_type(llm_id):
def message_fit_in(msg, max_length=4000):
+ """
+ 调整消息列表使其token总数不超过max_length限制
+
+ 参数:
+ msg: 消息列表,每个元素为包含role和content的字典
+ max_length: 最大token数限制,默认4000
+
+ 返回:
+ tuple: (实际token数, 调整后的消息列表)
+ """
def count():
+ """计算当前消息列表的总token数"""
nonlocal msg
tks_cnts = []
for m in msg:
@@ -67,9 +78,11 @@ def message_fit_in(msg, max_length=4000):
return total
c = count()
+ # If within the limit, return directly
if c < max_length:
return c, msg
-
+
+ # First pass: keep the system message(s) and the last message
msg_ = [m for m in msg if m["role"] == "system"]
if len(msg) > 1:
msg_.append(msg[-1])
@@ -77,15 +90,18 @@ def message_fit_in(msg, max_length=4000):
c = count()
if c < max_length:
return c, msg
-
+
+ # Count the tokens of the system message and the last message
ll = num_tokens_from_string(msg_[0]["content"])
ll2 = num_tokens_from_string(msg_[-1]["content"])
+ # If the system message takes more than 80% of the tokens, truncate the system message
if ll / (ll + ll2) > 0.8:
m = msg_[0]["content"]
m = encoder.decode(encoder.encode(m)[:max_length - ll2])
msg[0]["content"] = m
return max_length, msg
-
+
+ # Otherwise truncate the last message
m = msg_[-1]["content"]
m = encoder.decode(encoder.encode(m)[:max_length - ll2])
msg[-1]["content"] = m
@@ -93,6 +109,23 @@ def message_fit_in(msg, max_length=4000):
def kb_prompt(kbinfos, max_tokens):
+ """
+ 将检索到的知识库内容格式化为适合大语言模型的提示词
+
+ 参数:
+ kbinfos (dict): 检索结果,包含chunks等信息
+ max_tokens (int): 模型的最大token限制
+
+ 流程:
+ 1. 提取所有检索到的文档片段内容
+ 2. 计算token数量,确保不超过模型限制
+ 3. 获取文档元数据
+ 4. 按文档名组织文档片段
+ 5. 格式化为结构化提示词
+
+ 返回:
+ list: 格式化后的知识库内容列表,每个元素是一个文档的相关信息
+ """
knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
used_token_count = 0
chunks_num = 0
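A rough sketch of the grouping and budgeting that kb_prompt() performs, with a character budget standing in for the token budget and invented chunk data:

def format_knowledge(chunks, max_chars=2000):
    docs, used = {}, 0
    for ck in chunks:
        if used + len(ck["content_with_weight"]) > max_chars:
            break                                            # stop once the budget is spent
        used += len(ck["content_with_weight"])
        docs.setdefault(ck["docnm_kwd"], []).append(ck["content_with_weight"])
    return ["Document: %s\nContains the following relevant fragments:\n%s"
            % (name, "\n".join(frags)) for name, frags in docs.items()]

print(format_knowledge([
    {"docnm_kwd": "guide.pdf", "content_with_weight": "Step 1: install the service."},
    {"docnm_kwd": "guide.pdf", "content_with_weight": "Step 2: configure the index."},
]))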
@@ -126,58 +159,56 @@ def kb_prompt(kbinfos, max_tokens):
def citation_prompt():
return """
+# 引用要求:
+- 以格式 '##i$$ ##j$$'插入引用,其中 i, j 是所引用内容的 ID,并用 '##' 和 '$$' 包裹。
+- 在句子末尾插入引用,每个句子最多 4 个引用。
+- 如果答案内容不来自检索到的文本块,则不要插入引用。
-# Citation requirements:
-- Inserts CITATIONS in format '##i$$ ##j$$' where i,j are the ID of the content you are citing and encapsulated with '##' and '$$'.
-- Inserts the CITATION symbols at the end of a sentence, AND NO MORE than 4 citations.
-- DO NOT insert CITATION in the answer if the content is not from retrieved chunks.
+--- 示例 ---
+<SYSTEM>: 以下是知识库:
---- Example START ---
-<SYSTEM>: Here is the knowledge base:
-
-Document: Elon Musk Breaks Silence on Crypto, Warns Against Dogecoin ...
+Document: 埃隆·马斯克打破沉默谈加密货币,警告不要全仓狗狗币 ...
URL: https://blockworks.co/news/elon-musk-crypto-dogecoin
ID: 0
-The Tesla co-founder advised against going all-in on dogecoin, but Elon Musk said it’s still his favorite crypto...
+特斯拉联合创始人建议不要全仓投入 Dogecoin,但埃隆·马斯克表示它仍然是他最喜欢的加密货币...
-Document: Elon Musk's Dogecoin tweet sparks social media frenzy
+Document: 埃隆·马斯克关于狗狗币的推文引发社交媒体狂热
ID: 1
-Musk said he is 'willing to serve' D.O.G.E. – shorthand for Dogecoin.
+马斯克表示他“愿意服务”D.O.G.E.——即 Dogecoin 的缩写。
-Document: Causal effect of Elon Musk tweets on Dogecoin price
+Document: 埃隆·马斯克推文对狗狗币价格的因果影响
ID: 2
-If you think of Dogecoin — the cryptocurrency based on a meme — you can’t help but also think of Elon Musk...
+如果你想到 Dogecoin——这个基于表情包的加密货币,你就无法不想到埃隆·马斯克...
-Document: Elon Musk's Tweet Ignites Dogecoin's Future In Public Services
+Document: 埃隆·马斯克推文点燃狗狗币在公共服务领域的未来前景
ID: 3
-The market is heating up after Elon Musk's announcement about Dogecoin. Is this a new era for crypto?...
+在埃隆·马斯克关于 Dogecoin 的公告后,市场正在升温。这是否意味着加密货币的新纪元?...
- The above is the knowledge base.
+ 以上是知识库。
-<USER>: What's the Elon's view on dogecoin?
+<USER>: 埃隆·马斯克对 Dogecoin 的看法是什么?
-<ASSISTANT>: Musk has consistently expressed his fondness for Dogecoin, often citing its humor and the inclusion of dogs in its branding. He has referred to it as his favorite cryptocurrency ##0$$ ##1$$.
-Recently, Musk has hinted at potential future roles for Dogecoin. His tweets have sparked speculation about Dogecoin's potential integration into public services ##3$$.
-Overall, while Musk enjoys Dogecoin and often promotes it, he also warns against over-investing in it, reflecting both his personal amusement and caution regarding its speculative nature.
+<ASSISTANT>: 马斯克一贯表达了对 Dogecoin 的喜爱,常常提及其幽默感和品牌中狗的元素。他曾表示这是他最喜欢的加密货币 ##0$$ ##1$$。
+最近,马斯克暗示 Dogecoin 未来可能会有新的应用场景。他的推文引发了关于 Dogecoin 可能被整合到公共服务中的猜测 ##3$$。
+总体而言,虽然马斯克喜欢 Dogecoin 并经常推广它,但他也警告不要过度投资,反映了他对其投机性质的既喜爱又谨慎的态度。
---- Example END ---
+--- 示例结束 ---
"""
def keyword_extraction(chat_mdl, content, topn=3):
prompt = f"""
-Role: You're a text analyzer.
-Task: extract the most important keywords/phrases of a given piece of text content.
-Requirements:
- - Summarize the text content, and give top {topn} important keywords/phrases.
- - The keywords MUST be in language of the given piece of text content.
- - The keywords are delimited by ENGLISH COMMA.
- - Keywords ONLY in output.
+角色:文本分析器
+任务:提取给定文本内容中最重要的关键词/短语
+要求:
+- 总结文本内容,给出前{topn}个重要关键词/短语
+- 关键词必须使用原文语言
+- 关键词之间用英文逗号分隔
+- 仅输出关键词
-### Text Content
+### 文本内容
{content}
-
"""
msg = [
{"role": "system", "content": prompt},
diff --git a/requirements.txt b/requirements.txt
index 6760684..8067b51 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,250 +1,115 @@
-academicagent==0.1.2
-accelerate==1.5.2
-aiohappyeyeballs==2.5.0
-aiohttp==3.11.13
-aiosignal==1.3.2
-annotated-types==0.7.0
-anyio==4.8.0
-async-timeout==4.0.3
-attrs==25.1.0
-backoff==2.2.1
-backports.tarfile==1.2.0
-backtrader==1.9.78.123
-beartype==0.20.0
-beautifulsoup4==4.13.3
-bs4==0.0.2
-cachetools==5.5.2
-cbor==1.0.0
-certifi==2025.1.31
-cffi==1.17.1
-chardet==5.2.0
-charset-normalizer==3.4.1
-click==8.1.8
-cn2an==0.5.23
-cnki-agent==0.1.2
-CnkiSpider==1.1.0
-colorama==0.4.6
-coloredlogs==15.0.1
-colpali_engine==0.3.8
-contourpy==1.3.1
-cramjam==2.9.1
-cryptography==44.0.2
-csscompressor==0.9.5
-cssselect==1.3.0
-cssutils==2.11.1
-ctranslate2==4.5.0
-cycler==0.12.1
-dashscope==1.22.2
-dataclasses-json==0.6.7
-DataRecorder==3.6.2
-datasets==3.4.0
datrie==0.8.2
-dill==0.3.8
-diskcache==5.6.3
-distro==1.9.0
-docutils==0.21.2
-DownloadKit==2.0.7
-DrissionPage==4.1.0.17
-einops==0.8.1
-elastic-transport==8.17.1
-elasticsearch==8.17.2
-elasticsearch-dsl==8.17.1
-et_xmlfile==2.0.0
-evaluate==0.4.3
-exceptiongroup==1.2.2
-fastapi==0.115.11
-fastparquet==2024.11.0
-filelock==3.17.0
-FlagEmbedding==1.3.4
-flatbuffers==25.2.10
-fonttools==4.56.0
-frozenlist==1.5.0
-fsspec==2024.12.0
-google-ai-generativelanguage==0.6.15
-google-api-core==2.24.2
-google-api-python-client==2.164.0
-google-auth==2.38.0
-google-auth-httplib2==0.2.0
-google-generativeai==0.8.4
-googleapis-common-protos==1.69.1
-GPUtil==1.4.0
-greenlet==3.1.1
-grpcio==1.71.0
-grpcio-status==1.71.0
-h11==0.14.0
+akshare>=1.15.78,<2.0.0
+azure-storage-blob==12.22.0
+azure-identity==1.17.1
+azure-storage-file-datalake==12.16.0
+anthropic==0.34.1
+arxiv==2.1.3
+aspose-slides>=24.9.0,<25.0.0; platform_machine == 'x86_64' or (sys_platform == 'darwin' and platform_machine == 'arm64')
+beartype>=0.18.5,<0.19.0
+bio==1.7.1
+blinker==1.7.0
+boto3==1.34.140
+botocore==1.34.140
+cachetools==5.3.3
+chardet==5.2.0
+cn2an==0.5.22
+cohere==5.6.2
+Crawl4AI==0.3.8
+dashscope==1.20.11
+deepl==1.18.0
+demjson3==3.0.6
+discord-py==2.3.2
+duckduckgo-search>=7.2.0,<8.0.0
+editdistance==0.8.1
+elastic-transport==8.12.0
+elasticsearch==8.12.1
+elasticsearch-dsl==8.12.0
+filelock==3.15.4
+flask==3.0.3
+flask-cors==5.0.0
+flask-login==0.6.3
+flask-session==0.8.0
+google-search-results==2.4.2
+groq==0.9.0
hanziconv==0.3.2
-hf_transfer==0.1.9
-html-minifier==0.0.4
-httpcore==1.0.7
-httplib2==0.22.0
-httptools==0.6.4
-httpx==0.28.1
-httpx-sse==0.4.0
-huggingface-hub==0.29.3
-humanfriendly==10.0
-id==1.5.0
-idna==3.10
-ijson==3.3.0
-importlib_metadata==8.6.1
-infinity-sdk==0.6.0.dev3
-infinity_emb==0.0.75
-iniconfig==2.0.0
-inscriptis==2.5.3
-ir_datasets==0.5.10
-jaraco.classes==3.4.0
-jaraco.context==6.0.1
-jaraco.functools==4.1.0
-Jinja2==3.1.6
-jiter==0.9.0
-joblib==1.4.2
-jsmin==3.0.1
-json_repair==0.39.1
-jsonpatch==1.33
-jsonpointer==3.0.0
-keyring==25.6.0
-kiwisolver==1.4.8
-langchain==0.3.20
-langchain-community==0.3.19
-langchain-core==0.3.41
-langchain-ollama==0.2.3
-langchain-text-splitters==0.3.6
-langsmith==0.3.12
-lxml==5.3.1
-lz4==4.4.3
-markdown-it-py==3.0.0
-MarkupSafe==3.0.2
-marshmallow==3.26.1
-matplotlib==3.10.0
-mdurl==0.1.2
-monotonic==1.6
-more-itertools==10.6.0
-mpmath==1.3.0
-multidict==6.1.0
-multiprocess==0.70.16
-mypy-extensions==1.0.0
-mysql==0.0.3
-mysql-connector-python==9.2.0
-mysqlclient==2.2.7
-networkx==3.4.2
-nh3==0.2.21
+html-text==0.6.2
+httpx==0.27.0
+huggingface-hub>=0.25.0,<0.26.0
+infinity-sdk==0.6.0-dev3
+infinity-emb>=0.0.66,<0.0.67
+itsdangerous==2.1.2
+json-repair==0.35.0
+markdown==3.6
+markdown-to-json==2.1.1
+minio==7.2.4
+mistralai==0.4.2
nltk==3.9.1
-numpy==1.26.4
-ollama==0.4.7
-onnx==1.17.0
-onnxruntime==1.21.0
-openai==1.66.3
-openpyxl==3.1.5
-optimum==1.24.0
-orjson==3.10.15
-ormsgpack==1.8.0
-outcome==1.3.0.post0
-packaging==24.2
-pandas==2.2.3
-pdfminer.six==20231228
-pdfplumber==0.11.5
-peft==0.14.0
-pillow==11.1.0
-pluggy==1.5.0
-polars-lts-cpu==1.9.0
-posthog==3.20.0
-proces==0.1.7
-prometheus-fastapi-instrumentator==7.0.2
-prometheus_client==0.21.1
-propcache==0.3.0
-proto-plus==1.26.1
-protobuf==5.29.3
-psutil==7.0.0
-pyarrow==17.0.0
-pyasn1==0.6.1
-pyasn1_modules==0.4.1
-pycparser==2.22
-pycryptodome==3.21.0
+numpy>=1.26.0,<2.0.0
+ollama==0.2.1
+onnxruntime==1.19.2; sys_platform == 'darwin' or platform_machine != 'x86_64'
+onnxruntime-gpu==1.19.2; sys_platform != 'darwin' and platform_machine == 'x86_64'
+openai==1.45.0
+opencv-python==4.10.0.84
+opencv-python-headless==4.10.0.84
+openpyxl>=3.1.0,<4.0.0
+ormsgpack==1.5.0
+pandas>=2.2.0,<3.0.0
+pdfplumber==0.10.4
+peewee==3.17.1
+pillow==10.4.0
+protobuf==5.27.2
+psycopg2-binary==2.9.9
+pyclipper==1.3.0.post5
pycryptodomex==3.20.0
-pydantic==2.9.2
-pydantic-settings==2.8.1
-pydantic_core==2.23.4
-Pygments==2.19.1
-PyJWT==2.8.0
-PyMuPDF==1.25.3
-PyMySQL==1.1.1
-pyparsing==3.2.1
-pypdfium2==4.30.1
-pyreadline3==3.5.4
-PySocks==1.7.1
-pytest==8.3.5
-python-dateutil==2.9.0.post0
+pypdf>=5.0.0,<6.0.0
+pytest>=8.3.0,<9.0.0
python-dotenv==1.0.1
-pytz==2025.1
-pywin32-ctypes==0.2.3
-PyYAML==6.0.2
-readerwriterlock==1.0.9
-readme_renderer==44.0
-regex==2024.11.6
-requests==2.32.3
-requests-file==2.1.0
-requests-toolbelt==1.0.0
-rfc3986==2.0.0
-rich==13.9.4
+python-dateutil==2.8.2
+python-pptx>=1.0.2,<2.0.0
+pywencai==0.12.2
+qianfan==0.4.6
+ranx==0.3.20
+readability-lxml==0.8.1
+valkey==6.0.2
+requests==2.32.2
+replicate==0.31.0
roman-numbers==1.0.2
-rsa==4.9
-ruamel.yaml==0.18.10
-ruamel.yaml.clib==0.2.12
-safetensors==0.5.3
-scikit-learn==1.6.1
-scipy==1.15.2
-selenium==4.29.0
-sentence-transformers==3.4.1
-sentencepiece==0.2.0
-shellingham==1.5.4
-simplejson==3.20.1
-six==1.17.0
-sniffio==1.3.1
-sortedcontainers==2.4.0
-soundfile==0.12.1
-soupsieve==2.6
-SQLAlchemy==2.0.38
-sqlglot==11.7.1
-starlette==0.46.1
-StrEnum==0.4.15
-sympy==1.13.1
-tenacity==9.0.0
-threadpoolctl==3.6.0
-thrift==0.20.0
-tiktoken==0.9.0
-timm==1.0.15
-tldextract==5.1.3
-tokenizers==0.21.1
-tomli==2.2.1
-torch==2.6.0
-torchvision==0.21.0
-tqdm==4.67.1
-transformers==4.47.1
-trec-car-tools==2.6
-trio==0.29.0
-trio-websocket==0.12.2
-tushare==1.4.18
-twine==6.1.0
-typer==0.12.5
-typing-inspect==0.9.0
-typing_extensions==4.12.2
-tzdata==2025.1
-unlzw3==0.2.3
-uritemplate==4.1.1
-urllib3==2.3.0
-uvicorn==0.32.1
-valkey==6.1.0
-warc3-wet==0.2.5
-warc3-wet-clueweb09==0.2.5
-watchfiles==1.0.4
-webdriver-manager==4.0.2
-websocket-client==1.8.0
-websockets==15.0.1
-Werkzeug==3.1.3
+ruamel-base==1.0.0
+scholarly==1.7.11
+scikit-learn==1.5.0
+selenium==4.22.0
+selenium-wire==5.1.0
+setuptools>=75.2.0,<76.0.0
+shapely==2.0.5
+six==1.16.0
+strenum==0.4.15
+tabulate==0.9.0
+tavily-python==0.5.1
+tencentcloud-sdk-python==3.0.1215
+tika==2.6.0
+tiktoken==0.7.0
+umap_learn==0.5.6
+vertexai==1.64.0
+volcengine==1.0.146
+voyageai==0.2.3
+webdriver-manager==4.0.1
+werkzeug==3.0.6
+wikipedia==1.4.0
word2number==1.1
-wsproto==1.2.0
-xxhash==3.5.0
-yarl==1.18.3
-zhipuai==2.1.5.20250106
-zipp==3.21.0
-zlib-state==0.1.9
-zstandard==0.23.0
\ No newline at end of file
+xgboost==1.5.0
+xpinyin==0.7.6
+yfinance==0.1.96
+zhipuai==2.0.1
+ruamel-yaml>=0.18.6,<0.19.0
+google-generativeai>=0.8.1,<0.9.0
+python-docx>=1.1.2,<2.0.0
+pypdf2>=3.0.1,<4.0.0
+graspologic>=3.4.1,<4.0.0
+pymysql>=1.1.1,<2.0.0
+mini-racer>=0.12.4,<0.13.0
+pyodbc>=5.2.0,<6.0.0
+pyicu>=2.13.1,<3.0.0
+flasgger>=0.9.7.1,<0.10.0
+xxhash>=3.5.0,<4.0.0
+trio>=0.29.0
\ No newline at end of file
diff --git a/web/src/pages/chat/chat-configuration-modal/assistant-setting.tsx b/web/src/pages/chat/chat-configuration-modal/assistant-setting.tsx
index 69a5b2c..cabbb28 100644
--- a/web/src/pages/chat/chat-configuration-modal/assistant-setting.tsx
+++ b/web/src/pages/chat/chat-configuration-modal/assistant-setting.tsx
@@ -95,7 +95,7 @@ const AssistantSetting = ({