From a51b3168a099c1a09e29483157e07cb697f417d0 Mon Sep 17 00:00:00 2001
From: zstar <65890619+zstar1003@users.noreply.github.com>
Date: Thu, 3 Apr 2025 21:00:49 +0800
Subject: [PATCH] Merge pull request #11 from zstar1003/dev
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

refactor: add some comments to aid understanding
---
 api/db/services/dialog_service.py |  42 ++-
 rag/nlp/query.py                  |  65 +++-
 rag/nlp/search.py                 |  95 ++++-
 rag/nlp/term_weight.py            |   8 +
 rag/prompts.py                    |  97 +++--
 requirements.txt                  | 351 ++++++------------
 .../assistant-setting.tsx         |   2 +-
 7 files changed, 356 insertions(+), 304 deletions(-)

diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py
index e7d7118..d8500a1 100644
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@@ -239,6 +239,7 @@ def chat(dialog, messages, stream=True, **kwargs):
     prompt4citation = ""
     if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
         prompt4citation = citation_prompt()
+    # Filter out messages with the "system" role (the system message has already been handled separately above)
     msg.extend([{"role": m["role"], "content": re.sub(r"##\d+\$\$", "", m["content"])} for m in messages if m["role"] != "system"])
     used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.95))
@@ -309,16 +310,20 @@ def chat(dialog, messages, stream=True, **kwargs):
         return {"answer": think+answer, "reference": refs, "prompt": re.sub(r"\n", " \n", prompt), "created_at": time.time()}

     if stream:
-        last_ans = ""
-        answer = ""
+        last_ans = ""  # the complete answer returned on the previous yield
+        answer = ""  # the complete answer accumulated so far
         for ans in chat_mdl.chat_streamly(prompt+prompt4citation, msg[1:], gen_conf):
+            # If a thinking trace (thought) is present, strip its markers
             if thought:
                 ans = re.sub(r".*", "", ans, flags=re.DOTALL)
             answer = ans
+            # Compute the newly generated text fragment (delta)
             delta_ans = ans[len(last_ans):]
+            # If the new fragment is too small (fewer than 16 tokens), skip this round to avoid sending tiny pieces too often
             if num_tokens_from_string(delta_ans) < 16:
                 continue
             last_ans = answer
+            # Yield the accumulated answer so far (including the thinking trace) plus the new fragment
             yield {"answer": thought+answer, "reference": {}, "audio_binary": tts(tts_mdl, delta_ans)}
         delta_ans = answer[len(last_ans):]
         if delta_ans:
@@ -471,20 +476,48 @@ def tts(tts_mdl, text):


 def ask(question, kb_ids, tenant_id):
+    """
+    Handle a user search request: retrieve relevant information from the knowledge bases and generate an answer.
+
+    Args:
+        question (str): the user's question or query
+        kb_ids (list): list of knowledge base IDs specifying which knowledge bases to search
+        tenant_id (str): tenant ID, used for access control and resource isolation
+
+    Flow:
+        1. Fetch the specified knowledge bases
+        2. Determine which embedding model to use
+        3. Pick a retriever based on the knowledge base type (regular retriever or knowledge-graph retriever)
+        4. Initialize the embedding model and the chat model
+        5. Run retrieval to obtain relevant document chunks
+        6. Format the knowledge base content as context
+        7. Build the system prompt
+        8. Generate the answer and add citation markers
+        9. Stream the generated answer back
+
+    Returns:
+        generator: a generator that yields dicts containing the answer and reference information
+    """
+
     kbs = KnowledgebaseService.get_by_ids(kb_ids)
     embedding_list = list(set([kb.embd_id for kb in kbs]))

     is_knowledge_graph = all([kb.parser_id == ParserType.KG for kb in kbs])
     retriever = settings.retrievaler if not is_knowledge_graph else settings.kg_retrievaler
-
+    # Initialize the embedding model, used to convert text into vector representations
     embd_mdl = LLMBundle(tenant_id, LLMType.EMBEDDING, embedding_list[0])
+    # Initialize the chat model, used to generate the answer
     chat_mdl = LLMBundle(tenant_id, LLMType.CHAT)
+    # Maximum token length of the chat model, used to bound the context length
     max_tokens = chat_mdl.max_length
+    # Collect the tenant IDs of all knowledge bases and deduplicate them
     tenant_ids = list(set([kb.tenant_id for kb in kbs]))
+    # Call the retriever to fetch relevant document chunks
     kbinfos = retriever.retrieval(question, embd_mdl, tenant_ids, kb_ids,
                                   1, 12, 0.1, 0.3, aggs=False,
                                   rank_feature=label_question(question, kbs)
                                   )
+    # Format the retrieval results into prompt text, keeping within the model's maximum token limit
     knowledges = kb_prompt(kbinfos, max_tokens)
     prompt = """
     Role: You're a smart assistant. Your name is Miss R.
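The streaming hunk above documents a delta-batching pattern: the cumulative answer is tracked and a chunk is only yielded once the unsent portion is large enough. The following is a minimal, self-contained sketch of that pattern, assuming a toy whitespace token counter and a fake cumulative stream in place of num_tokens_from_string and chat_mdl.chat_streamly (both are hypothetical stand-ins, not the project's real APIs).

def count_tokens(text: str) -> int:
    # Crude stand-in for num_tokens_from_string; a real tokenizer would be used instead.
    return len(text.split())


def fake_stream():
    # Stand-in for chat_mdl.chat_streamly: each item is the *cumulative* answer so far.
    partial = ""
    for word in "retrieval augmented generation streams cumulative answers to the client".split():
        partial += word + " "
        yield partial


def batched_deltas(min_tokens=4):
    last_sent = ""   # full answer already sent to the client (last_ans in the patch)
    answer = ""      # latest cumulative answer from the model
    for answer in fake_stream():
        delta = answer[len(last_sent):]
        if count_tokens(delta) < min_tokens:
            continue                 # batch tiny fragments instead of yielding them
        last_sent = answer
        yield delta
    tail = answer[len(last_sent):]   # flush whatever is still unsent when the stream ends
    if tail:
        yield tail


if __name__ == "__main__":
    for piece in batched_deltas():
        print(repr(piece))

Flushing the tail after the loop mirrors the final delta_ans check in the patch, so the last few tokens are not dropped when the stream ends.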
@@ -504,6 +537,7 @@ def ask(question, kb_ids, tenant_id): """ % "\n".join(knowledges) msg = [{"role": "user", "content": question}] + # 生成完成后添加回答中的引用标记 def decorate_answer(answer): nonlocal knowledges, kbinfos, prompt answer, idx = retriever.insert_citations(answer, @@ -534,4 +568,4 @@ def ask(question, kb_ids, tenant_id): for ans in chat_mdl.chat_streamly(prompt, msg, {"temperature": 0.1}): answer = ans yield {"answer": answer, "reference": {}} - yield decorate_answer(answer) + yield decorate_answer(answer) \ No newline at end of file diff --git a/rag/nlp/query.py b/rag/nlp/query.py index b58efb8..8e1aca3 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -71,13 +71,27 @@ class FulltextQueryer: return txt def question(self, txt, tbl="qa", min_match: float = 0.6): + """ + 处理用户问题并生成全文检索表达式 + + 参数: + txt: 原始问题文本 + tbl: 查询表名(默认"qa") + min_match: 最小匹配阈值(默认0.6) + + 返回: + MatchTextExpr: 全文检索表达式对象 + list: 提取的关键词列表 + """ + # 1. 文本预处理:去除特殊字符、繁体转简体、全角转半角、转小写 txt = re.sub( r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>]+", " ", rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())), ).strip() - txt = FulltextQueryer.rmWWW(txt) + txt = FulltextQueryer.rmWWW(txt) # 去除停用词 + # 2. 非中文文本处理 if not self.isChinese(txt): txt = FulltextQueryer.rmWWW(txt) tks = rag_tokenizer.tokenize(txt).split() @@ -117,30 +131,43 @@ class FulltextQueryer: ), keywords def need_fine_grained_tokenize(tk): + """ + 判断是否需要细粒度分词 + 参数: + tk: 待判断的词条 + 返回: + bool: True表示需要细粒度分词 + """ if len(tk) < 3: return False if re.match(r"[0-9a-z\.\+#_\*-]+$", tk): return False return True - txt = FulltextQueryer.rmWWW(txt) - qs, keywords = [], [] + txt = FulltextQueryer.rmWWW(txt) # 二次去除停用词 + qs, keywords = [], [] # 初始化查询表达式和关键词列表 + # 3. 中文文本处理(最多处理256个词) for tt in self.tw.split(txt)[:256]: # .split(): if not tt: continue + # 3.1 基础关键词收集 keywords.append(tt) - twts = self.tw.weights([tt]) - syns = self.syn.lookup(tt) + twts = self.tw.weights([tt]) # 获取词权重 + syns = self.syn.lookup(tt) # 查询同义词 + # 3.2 同义词扩展(最多扩展到32个关键词) if syns and len(keywords) < 32: keywords.extend(syns) logging.debug(json.dumps(twts, ensure_ascii=False)) tms = [] + # 3.3 处理每个词及其权重 for tk, w in sorted(twts, key=lambda x: x[1] * -1): + # 3.3.1 细粒度分词处理 sm = ( rag_tokenizer.fine_grained_tokenize(tk).split() if need_fine_grained_tokenize(tk) else [] ) + # 3.3.2 清洗分词结果 sm = [ re.sub( r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+", @@ -151,36 +178,41 @@ class FulltextQueryer: ] sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1] sm = [m for m in sm if len(m) > 1] - + # 3.3.3 收集关键词(不超过32个) if len(keywords) < 32: keywords.append(re.sub(r"[ \\\"']+", "", tk)) keywords.extend(sm) - + + # 3.3.4 同义词处理 tk_syns = self.syn.lookup(tk) tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns] if len(keywords) < 32: keywords.extend([s for s in tk_syns if s]) tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s] tk_syns = [f"\"{s}\"" if s.find(" ") > 0 else s for s in tk_syns] - + # 关键词数量限制 if len(keywords) >= 32: break - + + # 3.3.5 构建查询表达式 tk = FulltextQueryer.subSpecialChar(tk) if tk.find(" ") > 0: - tk = '"%s"' % tk + tk = '"%s"' % tk # 处理短语查询 if tk_syns: - tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns) + tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns) # 添加同义词查询 if sm: - tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm)) + tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm)) # 添加细粒度分词查询 if tk.strip(): - tms.append((tk, w)) - + tms.append((tk, w)) # 保存带权重的查询表达式 + + # 3.4 合并当前词的查询表达式 tms = " 
".join([f"({t})^{w}" for t, w in tms]) + # 3.5 添加相邻词组合查询(提升短语匹配权重) if len(twts) > 1: tms += ' ("%s"~2)^1.5' % rag_tokenizer.tokenize(tt) + # 3.6 处理同义词查询表达式 syns = " OR ".join( [ '"%s"' @@ -191,9 +223,10 @@ class FulltextQueryer: if syns and tms: tms = f"({tms})^5 OR ({syns})^0.7" - qs.append(tms) + qs.append(tms) # 添加到最终查询列表 - if qs: + # 4. 生成最终查询表达式 + if qs: query = " OR ".join([f"({t})" for t in qs if t]) return MatchTextExpr( self.query_fields, query, 100, {"minimum_should_match": min_match} diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 86416cd..a98c965 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -70,14 +70,44 @@ class Dealer: highlight=False, rank_feature: dict | None = None ): + """ + 执行混合检索(全文检索+向量检索) + + 参数: + req: 请求参数字典,包含: + - page: 页码 + - topk: 返回结果最大数量 + - size: 每页大小 + - fields: 指定返回字段 + - question: 查询问题文本 + - similarity: 向量相似度阈值 + idx_names: 索引名称或列表 + kb_ids: 知识库ID列表 + emb_mdl: 嵌入模型,用于向量检索 + highlight: 是否返回高亮内容 + rank_feature: 排序特征配置 + + 返回: + SearchResult对象,包含: + - total: 匹配总数 + - ids: 匹配的chunk ID列表 + - query_vector: 查询向量 + - field: 各chunk的字段值 + - highlight: 高亮内容 + - aggregation: 聚合结果 + - keywords: 提取的关键词 + """ + # 1. 初始化过滤条件和排序规则 filters = self.get_filters(req) orderBy = OrderByExpr() + # 2. 处理分页参数 pg = int(req.get("page", 1)) - 1 topk = int(req.get("topk", 1024)) ps = int(req.get("size", topk)) offset, limit = pg * ps, ps - + + # 3. 设置返回字段(默认包含文档名、内容等核心字段) src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int", "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", @@ -85,9 +115,11 @@ class Dealer: "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD]) kwds = set([]) + # 4. 处理查询问题 qst = req.get("question", "") q_vec = [] if not qst: + # 4.1 无查询文本时的处理(按文档排序) if req.get("sort"): orderBy.asc("page_num_int") orderBy.asc("top_int") @@ -96,22 +128,29 @@ class Dealer: total = self.dataStore.getTotal(res) logging.debug("Dealer.search TOTAL: {}".format(total)) else: + # 4.2 有查询文本时的处理 highlightFields = ["content_ltks", "title_tks"] if highlight else [] + + # 4.2.1 生成全文检索表达式和关键词 matchText, keywords = self.qryr.question(qst, min_match=0.3) if emb_mdl is None: + # 4.2.2 纯全文检索模式 matchExprs = [matchText] res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids, rank_feature=rank_feature) total = self.dataStore.getTotal(res) logging.debug("Dealer.search TOTAL: {}".format(total)) else: + # 4.2.3 混合检索模式(全文+向量) + # 生成查询向量 matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1)) q_vec = matchDense.embedding_data src.append(f"q_{len(q_vec)}_vec") - + # 设置混合检索权重(全文5% + 向量95%) fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05, 0.95"}) matchExprs = [matchText, matchDense, fusionExpr] + # 执行混合检索 res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids, rank_feature=rank_feature) total = self.dataStore.getTotal(res) @@ -340,48 +379,86 @@ class Dealer: vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True, rerank_mdl=None, highlight=False, rank_feature: dict | None = {PAGERANK_FLD: 10}): + """ + 执行检索操作,根据问题查询相关文档片段 + + 参数说明: + - question: 用户输入的查询问题 + - embd_mdl: 嵌入模型,用于将文本转换为向量 + - tenant_ids: 租户ID,可以是字符串或列表 + - kb_ids: 知识库ID列表 + - page: 当前页码 + - page_size: 每页结果数量 + - similarity_threshold: 相似度阈值,低于此值的结果将被过滤 + - vector_similarity_weight: 向量相似度权重 + - top: 检索的最大结果数 + - doc_ids: 文档ID列表,用于限制检索范围 + - aggs: 是否聚合文档信息 + - 
rerank_mdl: 重排序模型 + - highlight: 是否高亮匹配内容 + - rank_feature: 排序特征,如PageRank值 + + 返回: + 包含检索结果的字典,包括总数、文档片段和文档聚合信息 + """ + # 初始化结果字典 ranks = {"total": 0, "chunks": [], "doc_aggs": {}} if not question: return ranks - + # 设置重排序页面限制 RERANK_PAGE_LIMIT = 3 + # 构建检索请求参数 req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": max(page_size * RERANK_PAGE_LIMIT, 128), "question": question, "vector": True, "topk": top, "similarity": similarity_threshold, "available_int": 1} - + + # 如果页码超过重排序限制,直接请求指定页的数据 if page > RERANK_PAGE_LIMIT: req["page"] = page req["size"] = page_size + # 处理租户ID格式 if isinstance(tenant_ids, str): tenant_ids = tenant_ids.split(",") - + + # 执行搜索操作 sres = self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight, rank_feature=rank_feature) ranks["total"] = sres.total - + + # 根据页码决定是否需要重排序 if page <= RERANK_PAGE_LIMIT: + # 前几页需要重排序以提高结果质量 if rerank_mdl and sres.total > 0: + # 使用重排序模型进行重排序 sim, tsim, vsim = self.rerank_by_model(rerank_mdl, sres, question, 1 - vector_similarity_weight, vector_similarity_weight, rank_feature=rank_feature) else: + # 使用默认方法进行重排序 sim, tsim, vsim = self.rerank( sres, question, 1 - vector_similarity_weight, vector_similarity_weight, rank_feature=rank_feature) + # 根据相似度降序排序,并选择当前页的结果 idx = np.argsort(sim * -1)[(page - 1) * page_size:page * page_size] else: + # 后续页面不需要重排序,直接使用搜索结果 sim = tsim = vsim = [1] * len(sres.ids) idx = list(range(len(sres.ids))) - + + # 获取向量维度和列名 dim = len(sres.query_vector) vector_column = f"q_{dim}_vec" zero_vector = [0.0] * dim + + # 处理每个检索结果 for i in idx: + # 过滤低于阈值的结果 if sim[i] < similarity_threshold: break + # 控制返回结果数量 if len(ranks["chunks"]) >= page_size: if aggs: continue @@ -391,6 +468,7 @@ class Dealer: dnm = chunk.get("docnm_kwd", "") did = chunk.get("doc_id", "") position_int = chunk.get("position_int", []) + # 构建结果字典 d = { "chunk_id": id, "content_ltks": chunk["content_ltks"], @@ -406,6 +484,8 @@ class Dealer: "vector": chunk.get(vector_column, zero_vector), "positions": position_int, } + + # 处理高亮内容 if highlight and sres.highlight: if id in sres.highlight: d["highlight"] = rmSpace(sres.highlight[id]) @@ -415,6 +495,7 @@ class Dealer: if dnm not in ranks["doc_aggs"]: ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0} ranks["doc_aggs"][dnm]["count"] += 1 + # 将文档聚合信息转换为列表格式,并按计数降序排序 ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k, diff --git a/rag/nlp/term_weight.py b/rag/nlp/term_weight.py index 6ab49a2..97c3d99 100644 --- a/rag/nlp/term_weight.py +++ b/rag/nlp/term_weight.py @@ -149,6 +149,14 @@ class Dealer: return res def split(self, txt): + """ + 特殊分词方法,主要处理连续英文单词的合并 + 参数: + txt: 待分词的文本字符串 + + 返回: + 处理后的词条列表 + """ tks = [] for t in re.sub(r"[ \t]+", " ", txt).split(): if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \ diff --git a/rag/prompts.py b/rag/prompts.py index af6df16..87397b9 100644 --- a/rag/prompts.py +++ b/rag/prompts.py @@ -55,7 +55,18 @@ def llm_id2llm_type(llm_id): def message_fit_in(msg, max_length=4000): + """ + 调整消息列表使其token总数不超过max_length限制 + + 参数: + msg: 消息列表,每个元素为包含role和content的字典 + max_length: 最大token数限制,默认4000 + + 返回: + tuple: (实际token数, 调整后的消息列表) + """ def count(): + """计算当前消息列表的总token数""" nonlocal msg tks_cnts = [] for m in msg: @@ -67,9 +78,11 @@ def message_fit_in(msg, max_length=4000): return total c = count() + # 如果不超限制,直接返回 if c < max_length: return c, msg - + + # 第一次精简:保留系统消息和最后一条消息 msg_ = [m for m in msg if m["role"] == "system"] if len(msg) > 1: msg_.append(msg[-1]) @@ -77,15 +90,18 @@ def message_fit_in(msg, 
max_length=4000): c = count() if c < max_length: return c, msg - + + # 计算系统消息和最后一条消息的token数 ll = num_tokens_from_string(msg_[0]["content"]) ll2 = num_tokens_from_string(msg_[-1]["content"]) + # 如果系统消息占比超过80%,则截断系统消息 if ll / (ll + ll2) > 0.8: m = msg_[0]["content"] m = encoder.decode(encoder.encode(m)[:max_length - ll2]) msg[0]["content"] = m return max_length, msg - + + # 否则截断最后一条消息 m = msg_[-1]["content"] m = encoder.decode(encoder.encode(m)[:max_length - ll2]) msg[-1]["content"] = m @@ -93,6 +109,23 @@ def message_fit_in(msg, max_length=4000): def kb_prompt(kbinfos, max_tokens): + """ + 将检索到的知识库内容格式化为适合大语言模型的提示词 + + 参数: + kbinfos (dict): 检索结果,包含chunks等信息 + max_tokens (int): 模型的最大token限制 + + 流程: + 1. 提取所有检索到的文档片段内容 + 2. 计算token数量,确保不超过模型限制 + 3. 获取文档元数据 + 4. 按文档名组织文档片段 + 5. 格式化为结构化提示词 + + 返回: + list: 格式化后的知识库内容列表,每个元素是一个文档的相关信息 + """ knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]] used_token_count = 0 chunks_num = 0 @@ -126,58 +159,56 @@ def kb_prompt(kbinfos, max_tokens): def citation_prompt(): return """ +# 引用要求: +- 以格式 '##i$$ ##j$$'插入引用,其中 i, j 是所引用内容的 ID,并用 '##' 和 '$$' 包裹。 +- 在句子末尾插入引用,每个句子最多 4 个引用。 +- 如果答案内容不来自检索到的文本块,则不要插入引用。 -# Citation requirements: -- Inserts CITATIONS in format '##i$$ ##j$$' where i,j are the ID of the content you are citing and encapsulated with '##' and '$$'. -- Inserts the CITATION symbols at the end of a sentence, AND NO MORE than 4 citations. -- DO NOT insert CITATION in the answer if the content is not from retrieved chunks. +--- 示例 --- +: 以下是知识库: ---- Example START --- -: Here is the knowledge base: - -Document: Elon Musk Breaks Silence on Crypto, Warns Against Dogecoin ... +Document: 埃隆·马斯克打破沉默谈加密货币,警告不要全仓狗狗币 ... URL: https://blockworks.co/news/elon-musk-crypto-dogecoin ID: 0 -The Tesla co-founder advised against going all-in on dogecoin, but Elon Musk said it’s still his favorite crypto... +特斯拉联合创始人建议不要全仓投入 Dogecoin,但埃隆·马斯克表示它仍然是他最喜欢的加密货币... -Document: Elon Musk's Dogecoin tweet sparks social media frenzy +Document: 埃隆·马斯克关于狗狗币的推文引发社交媒体狂热 ID: 1 -Musk said he is 'willing to serve' D.O.G.E. – shorthand for Dogecoin. +马斯克表示他“愿意服务”D.O.G.E.——即 Dogecoin 的缩写。 -Document: Causal effect of Elon Musk tweets on Dogecoin price +Document: 埃隆·马斯克推文对狗狗币价格的因果影响 ID: 2 -If you think of Dogecoin — the cryptocurrency based on a meme — you can’t help but also think of Elon Musk... +如果你想到 Dogecoin——这个基于表情包的加密货币,你就无法不想到埃隆·马斯克... -Document: Elon Musk's Tweet Ignites Dogecoin's Future In Public Services +Document: 埃隆·马斯克推文点燃狗狗币在公共服务领域的未来前景 ID: 3 -The market is heating up after Elon Musk's announcement about Dogecoin. Is this a new era for crypto?... +在埃隆·马斯克关于 Dogecoin 的公告后,市场正在升温。这是否意味着加密货币的新纪元?... - The above is the knowledge base. + 以上是知识库。 -: What's the Elon's view on dogecoin? +: 埃隆·马斯克对 Dogecoin 的看法是什么? -: Musk has consistently expressed his fondness for Dogecoin, often citing its humor and the inclusion of dogs in its branding. He has referred to it as his favorite cryptocurrency ##0$$ ##1$$. -Recently, Musk has hinted at potential future roles for Dogecoin. His tweets have sparked speculation about Dogecoin's potential integration into public services ##3$$. -Overall, while Musk enjoys Dogecoin and often promotes it, he also warns against over-investing in it, reflecting both his personal amusement and caution regarding its speculative nature. 
+: 马斯克一贯表达了对 Dogecoin 的喜爱,常常提及其幽默感和品牌中狗的元素。他曾表示这是他最喜欢的加密货币 ##0 ##1。 +最近,马斯克暗示 Dogecoin 未来可能会有新的应用场景。他的推文引发了关于 Dogecoin 可能被整合到公共服务中的猜测 ##3$$。 +总体而言,虽然马斯克喜欢 Dogecoin 并经常推广它,但他也警告不要过度投资,反映了他对其投机性质的既喜爱又谨慎的态度。 ---- Example END --- +--- 示例结束 --- """ def keyword_extraction(chat_mdl, content, topn=3): prompt = f""" -Role: You're a text analyzer. -Task: extract the most important keywords/phrases of a given piece of text content. -Requirements: - - Summarize the text content, and give top {topn} important keywords/phrases. - - The keywords MUST be in language of the given piece of text content. - - The keywords are delimited by ENGLISH COMMA. - - Keywords ONLY in output. +角色:文本分析器 +任务:提取给定文本内容中最重要的关键词/短语 +要求: +- 总结文本内容,给出前{topn}个重要关键词/短语 +- 关键词必须使用原文语言 +- 关键词之间用英文逗号分隔 +- 仅输出关键词 -### Text Content +### 文本内容 {content} - """ msg = [ {"role": "system", "content": prompt}, diff --git a/requirements.txt b/requirements.txt index 6760684..8067b51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,250 +1,115 @@ -academicagent==0.1.2 -accelerate==1.5.2 -aiohappyeyeballs==2.5.0 -aiohttp==3.11.13 -aiosignal==1.3.2 -annotated-types==0.7.0 -anyio==4.8.0 -async-timeout==4.0.3 -attrs==25.1.0 -backoff==2.2.1 -backports.tarfile==1.2.0 -backtrader==1.9.78.123 -beartype==0.20.0 -beautifulsoup4==4.13.3 -bs4==0.0.2 -cachetools==5.5.2 -cbor==1.0.0 -certifi==2025.1.31 -cffi==1.17.1 -chardet==5.2.0 -charset-normalizer==3.4.1 -click==8.1.8 -cn2an==0.5.23 -cnki-agent==0.1.2 -CnkiSpider==1.1.0 -colorama==0.4.6 -coloredlogs==15.0.1 -colpali_engine==0.3.8 -contourpy==1.3.1 -cramjam==2.9.1 -cryptography==44.0.2 -csscompressor==0.9.5 -cssselect==1.3.0 -cssutils==2.11.1 -ctranslate2==4.5.0 -cycler==0.12.1 -dashscope==1.22.2 -dataclasses-json==0.6.7 -DataRecorder==3.6.2 -datasets==3.4.0 datrie==0.8.2 -dill==0.3.8 -diskcache==5.6.3 -distro==1.9.0 -docutils==0.21.2 -DownloadKit==2.0.7 -DrissionPage==4.1.0.17 -einops==0.8.1 -elastic-transport==8.17.1 -elasticsearch==8.17.2 -elasticsearch-dsl==8.17.1 -et_xmlfile==2.0.0 -evaluate==0.4.3 -exceptiongroup==1.2.2 -fastapi==0.115.11 -fastparquet==2024.11.0 -filelock==3.17.0 -FlagEmbedding==1.3.4 -flatbuffers==25.2.10 -fonttools==4.56.0 -frozenlist==1.5.0 -fsspec==2024.12.0 -google-ai-generativelanguage==0.6.15 -google-api-core==2.24.2 -google-api-python-client==2.164.0 -google-auth==2.38.0 -google-auth-httplib2==0.2.0 -google-generativeai==0.8.4 -googleapis-common-protos==1.69.1 -GPUtil==1.4.0 -greenlet==3.1.1 -grpcio==1.71.0 -grpcio-status==1.71.0 -h11==0.14.0 +akshare>=1.15.78,<2.0.0 +azure-storage-blob==12.22.0 +azure-identity==1.17.1 +azure-storage-file-datalake==12.16.0 +anthropic==0.34.1 +arxiv==2.1.3 +aspose-slides>=24.9.0,<25.0.0; platform_machine == 'x86_64' or (sys_platform == 'darwin' and platform_machine == 'arm64') +beartype>=0.18.5,<0.19.0 +bio==1.7.1 +blinker==1.7.0 +boto3==1.34.140 +botocore==1.34.140 +cachetools==5.3.3 +chardet==5.2.0 +cn2an==0.5.22 +cohere==5.6.2 +Crawl4AI==0.3.8 +dashscope==1.20.11 +deepl==1.18.0 +demjson3==3.0.6 +discord-py==2.3.2 +duckduckgo-search>=7.2.0,<8.0.0 +editdistance==0.8.1 +elastic-transport==8.12.0 +elasticsearch==8.12.1 +elasticsearch-dsl==8.12.0 +filelock==3.15.4 +flask==3.0.3 +flask-cors==5.0.0 +flask-login==0.6.3 +flask-session==0.8.0 +google-search-results==2.4.2 +groq==0.9.0 hanziconv==0.3.2 -hf_transfer==0.1.9 -html-minifier==0.0.4 -httpcore==1.0.7 -httplib2==0.22.0 -httptools==0.6.4 -httpx==0.28.1 -httpx-sse==0.4.0 -huggingface-hub==0.29.3 -humanfriendly==10.0 -id==1.5.0 -idna==3.10 -ijson==3.3.0 -importlib_metadata==8.6.1 
-infinity-sdk==0.6.0.dev3 -infinity_emb==0.0.75 -iniconfig==2.0.0 -inscriptis==2.5.3 -ir_datasets==0.5.10 -jaraco.classes==3.4.0 -jaraco.context==6.0.1 -jaraco.functools==4.1.0 -Jinja2==3.1.6 -jiter==0.9.0 -joblib==1.4.2 -jsmin==3.0.1 -json_repair==0.39.1 -jsonpatch==1.33 -jsonpointer==3.0.0 -keyring==25.6.0 -kiwisolver==1.4.8 -langchain==0.3.20 -langchain-community==0.3.19 -langchain-core==0.3.41 -langchain-ollama==0.2.3 -langchain-text-splitters==0.3.6 -langsmith==0.3.12 -lxml==5.3.1 -lz4==4.4.3 -markdown-it-py==3.0.0 -MarkupSafe==3.0.2 -marshmallow==3.26.1 -matplotlib==3.10.0 -mdurl==0.1.2 -monotonic==1.6 -more-itertools==10.6.0 -mpmath==1.3.0 -multidict==6.1.0 -multiprocess==0.70.16 -mypy-extensions==1.0.0 -mysql==0.0.3 -mysql-connector-python==9.2.0 -mysqlclient==2.2.7 -networkx==3.4.2 -nh3==0.2.21 +html-text==0.6.2 +httpx==0.27.0 +huggingface-hub>=0.25.0,<0.26.0 +infinity-sdk==0.6.0-dev3 +infinity-emb>=0.0.66,<0.0.67 +itsdangerous==2.1.2 +json-repair==0.35.0 +markdown==3.6 +markdown-to-json==2.1.1 +minio==7.2.4 +mistralai==0.4.2 nltk==3.9.1 -numpy==1.26.4 -ollama==0.4.7 -onnx==1.17.0 -onnxruntime==1.21.0 -openai==1.66.3 -openpyxl==3.1.5 -optimum==1.24.0 -orjson==3.10.15 -ormsgpack==1.8.0 -outcome==1.3.0.post0 -packaging==24.2 -pandas==2.2.3 -pdfminer.six==20231228 -pdfplumber==0.11.5 -peft==0.14.0 -pillow==11.1.0 -pluggy==1.5.0 -polars-lts-cpu==1.9.0 -posthog==3.20.0 -proces==0.1.7 -prometheus-fastapi-instrumentator==7.0.2 -prometheus_client==0.21.1 -propcache==0.3.0 -proto-plus==1.26.1 -protobuf==5.29.3 -psutil==7.0.0 -pyarrow==17.0.0 -pyasn1==0.6.1 -pyasn1_modules==0.4.1 -pycparser==2.22 -pycryptodome==3.21.0 +numpy>=1.26.0,<2.0.0 +ollama==0.2.1 +onnxruntime==1.19.2; sys_platform == 'darwin' or platform_machine != 'x86_64' +onnxruntime-gpu==1.19.2; sys_platform != 'darwin' and platform_machine == 'x86_64' +openai==1.45.0 +opencv-python==4.10.0.84 +opencv-python-headless==4.10.0.84 +openpyxl>=3.1.0,<4.0.0 +ormsgpack==1.5.0 +pandas>=2.2.0,<3.0.0 +pdfplumber==0.10.4 +peewee==3.17.1 +pillow==10.4.0 +protobuf==5.27.2 +psycopg2-binary==2.9.9 +pyclipper==1.3.0.post5 pycryptodomex==3.20.0 -pydantic==2.9.2 -pydantic-settings==2.8.1 -pydantic_core==2.23.4 -Pygments==2.19.1 -PyJWT==2.8.0 -PyMuPDF==1.25.3 -PyMySQL==1.1.1 -pyparsing==3.2.1 -pypdfium2==4.30.1 -pyreadline3==3.5.4 -PySocks==1.7.1 -pytest==8.3.5 -python-dateutil==2.9.0.post0 +pypdf>=5.0.0,<6.0.0 +pytest>=8.3.0,<9.0.0 python-dotenv==1.0.1 -pytz==2025.1 -pywin32-ctypes==0.2.3 -PyYAML==6.0.2 -readerwriterlock==1.0.9 -readme_renderer==44.0 -regex==2024.11.6 -requests==2.32.3 -requests-file==2.1.0 -requests-toolbelt==1.0.0 -rfc3986==2.0.0 -rich==13.9.4 +python-dateutil==2.8.2 +python-pptx>=1.0.2,<2.0.0 +pywencai==0.12.2 +qianfan==0.4.6 +ranx==0.3.20 +readability-lxml==0.8.1 +valkey==6.0.2 +requests==2.32.2 +replicate==0.31.0 roman-numbers==1.0.2 -rsa==4.9 -ruamel.yaml==0.18.10 -ruamel.yaml.clib==0.2.12 -safetensors==0.5.3 -scikit-learn==1.6.1 -scipy==1.15.2 -selenium==4.29.0 -sentence-transformers==3.4.1 -sentencepiece==0.2.0 -shellingham==1.5.4 -simplejson==3.20.1 -six==1.17.0 -sniffio==1.3.1 -sortedcontainers==2.4.0 -soundfile==0.12.1 -soupsieve==2.6 -SQLAlchemy==2.0.38 -sqlglot==11.7.1 -starlette==0.46.1 -StrEnum==0.4.15 -sympy==1.13.1 -tenacity==9.0.0 -threadpoolctl==3.6.0 -thrift==0.20.0 -tiktoken==0.9.0 -timm==1.0.15 -tldextract==5.1.3 -tokenizers==0.21.1 -tomli==2.2.1 -torch==2.6.0 -torchvision==0.21.0 -tqdm==4.67.1 -transformers==4.47.1 -trec-car-tools==2.6 -trio==0.29.0 -trio-websocket==0.12.2 -tushare==1.4.18 -twine==6.1.0 
-typer==0.12.5 -typing-inspect==0.9.0 -typing_extensions==4.12.2 -tzdata==2025.1 -unlzw3==0.2.3 -uritemplate==4.1.1 -urllib3==2.3.0 -uvicorn==0.32.1 -valkey==6.1.0 -warc3-wet==0.2.5 -warc3-wet-clueweb09==0.2.5 -watchfiles==1.0.4 -webdriver-manager==4.0.2 -websocket-client==1.8.0 -websockets==15.0.1 -Werkzeug==3.1.3 +ruamel-base==1.0.0 +scholarly==1.7.11 +scikit-learn==1.5.0 +selenium==4.22.0 +selenium-wire==5.1.0 +setuptools>=75.2.0,<76.0.0 +shapely==2.0.5 +six==1.16.0 +strenum==0.4.15 +tabulate==0.9.0 +tavily-python==0.5.1 +tencentcloud-sdk-python==3.0.1215 +tika==2.6.0 +tiktoken==0.7.0 +umap_learn==0.5.6 +vertexai==1.64.0 +volcengine==1.0.146 +voyageai==0.2.3 +webdriver-manager==4.0.1 +werkzeug==3.0.6 +wikipedia==1.4.0 word2number==1.1 -wsproto==1.2.0 -xxhash==3.5.0 -yarl==1.18.3 -zhipuai==2.1.5.20250106 -zipp==3.21.0 -zlib-state==0.1.9 -zstandard==0.23.0 \ No newline at end of file +xgboost==1.5.0 +xpinyin==0.7.6 +yfinance==0.1.96 +zhipuai==2.0.1 +ruamel-yaml>=0.18.6,<0.19.0 +google-generativeai>=0.8.1,<0.9.0 +python-docx>=1.1.2,<2.0.0 +pypdf2>=3.0.1,<4.0.0 +graspologic>=3.4.1,<4.0.0 +pymysql>=1.1.1,<2.0.0 +mini-racer>=0.12.4,<0.13.0 +pyodbc>=5.2.0,<6.0.0 +pyicu>=2.13.1,<3.0.0 +flasgger>=0.9.7.1,<0.10.0 +xxhash>=3.5.0,<4.0.0 +trio>=0.29.0 \ No newline at end of file diff --git a/web/src/pages/chat/chat-configuration-modal/assistant-setting.tsx b/web/src/pages/chat/chat-configuration-modal/assistant-setting.tsx index 69a5b2c..cabbb28 100644 --- a/web/src/pages/chat/chat-configuration-modal/assistant-setting.tsx +++ b/web/src/pages/chat/chat-configuration-modal/assistant-setting.tsx @@ -95,7 +95,7 @@ const AssistantSetting = ({