RAGflow/api/db/services/knowledgebase_service.py

302 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from api.db import StatusEnum, TenantPermission
from api.db.db_models import Knowledgebase, DB, Tenant, User, UserTenant,Document
from api.db.services.common_service import CommonService
from peewee import fn
class KnowledgebaseService(CommonService):
model = Knowledgebase
@classmethod
@DB.connection_context()
def is_parsed_done(cls, kb_id):
"""
Check if all documents in the knowledge base have completed parsing
Args:
kb_id: Knowledge base ID
Returns:
If all documents are parsed successfully, returns (True, None)
If any document is not fully parsed, returns (False, error_message)
"""
from api.db import TaskStatus
from api.db.services.document_service import DocumentService
# Get knowledge base information
kbs = cls.query(id=kb_id)
if not kbs:
return False, "Knowledge base not found"
kb = kbs[0]
# Get all documents in the knowledge base
docs, _ = DocumentService.get_by_kb_id(kb_id, 1, 1000, "create_time", True, "")
# Check parsing status of each document
for doc in docs:
# If document is being parsed, don't allow chat creation
if doc['run'] == TaskStatus.RUNNING.value or doc['run'] == TaskStatus.CANCEL.value or doc['run'] == TaskStatus.FAIL.value:
return False, f"Document '{doc['name']}' in dataset '{kb.name}' is still being parsed. Please wait until all documents are parsed before starting a chat."
# If document is not yet parsed and has no chunks, don't allow chat creation
if doc['run'] == TaskStatus.UNSTART.value and doc['chunk_num'] == 0:
return False, f"Document '{doc['name']}' in dataset '{kb.name}' has not been parsed yet. Please parse all documents before starting a chat."
return True, None
@classmethod
@DB.connection_context()
def list_documents_by_ids(cls,kb_ids):
doc_ids=cls.model.select(Document.id.alias("document_id")).join(Document,on=(cls.model.id == Document.kb_id)).where(
cls.model.id.in_(kb_ids)
)
doc_ids =list(doc_ids.dicts())
doc_ids = [doc["document_id"] for doc in doc_ids]
return doc_ids
@classmethod
@DB.connection_context()
def get_by_tenant_ids(cls, joined_tenant_ids, user_id,
page_number, items_per_page,
orderby, desc, keywords,
parser_id=None
):
"""
根据租户ID列表获取知识库记录的方法。
参数:
joined_tenant_ids: 列表包含用户已加入的租户ID。
user_id: 用户ID用于查询特定用户的知识库。
page_number: 分页页码,用于分页显示知识库记录。
items_per_page: 每页显示的记录数。
orderby: 字符串,指定排序字段。
desc: 布尔值,指示是否按降序排序。
keywords: 字符串,用于模糊搜索知识库名称。
parser_id: 可选参数指定解析器ID以过滤知识库记录。
返回值:
返回两个元素的元组,第一个元素是知识库记录的字典列表,第二个元素是知识库记录的总数。
"""
# 定义需要查询的字段
fields = [
cls.model.id, # 知识库ID
cls.model.avatar, # 知识库头像
cls.model.name, # 知识库名称
cls.model.language, # 知识库语言
cls.model.description, # 知识库描述
cls.model.permission, # 知识库权限
cls.model.doc_num, # 文档数量
cls.model.token_num, # 令牌数量
cls.model.chunk_num, # 块数量
cls.model.parser_id, # 解析器ID
cls.model.embd_id, # 嵌入ID
User.nickname, # 租户昵称
User.avatar.alias('tenant_avatar'), # 租户头像,使用别名以便区分
cls.model.update_time # 更新时间
]
# 如果提供了关键词,则进行模糊搜索
if keywords:
kbs = cls.model.select(*fields).join(User, on=(cls.model.tenant_id == User.id)).where(
# 筛选条件租户ID在列表中且权限为团队或租户ID等于用户ID
((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
TenantPermission.TEAM.value)) | (
cls.model.tenant_id == user_id))
# 状态必须为有效
& (cls.model.status == StatusEnum.VALID.value),
# 名称包含关键词(不区分大小写)
(fn.LOWER(cls.model.name).contains(keywords.lower()))
)
else:
# 如果没有提供关键词,则不进行搜索,只筛选符合条件的知识库
kbs = cls.model.select(*fields).join(User, on=(cls.model.tenant_id == User.id)).where(
((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
TenantPermission.TEAM.value)) | (
cls.model.tenant_id == user_id))
& (cls.model.status == StatusEnum.VALID.value)
)
# 如果提供了解析器ID则进一步过滤知识库记录
if parser_id:
kbs = kbs.where(cls.model.parser_id == parser_id)
# 根据desc参数决定排序方式
if desc:
kbs = kbs.order_by(cls.model.getter_by(orderby).desc()) # 按降序排序
else:
kbs = kbs.order_by(cls.model.getter_by(orderby).asc()) # 按升序排序
# 获取知识库记录总数
count = kbs.count()
# 对查询结果进行分页
kbs = kbs.paginate(page_number, items_per_page)
# 将查询结果转换为字典列表并返回,同时返回记录总数
return list(kbs.dicts()), count
@classmethod
@DB.connection_context()
def get_kb_ids(cls, tenant_id):
fields = [
cls.model.id,
]
kbs = cls.model.select(*fields).where(cls.model.tenant_id == tenant_id)
kb_ids = [kb.id for kb in kbs]
return kb_ids
@classmethod
@DB.connection_context()
def get_detail(cls, kb_id):
fields = [
cls.model.id,
# Tenant.embd_id,
cls.model.embd_id,
cls.model.avatar,
cls.model.name,
cls.model.language,
cls.model.description,
cls.model.permission,
cls.model.doc_num,
cls.model.token_num,
cls.model.chunk_num,
cls.model.parser_id,
cls.model.parser_config,
cls.model.pagerank]
kbs = cls.model.select(*fields).join(Tenant, on=(
(Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
(cls.model.id == kb_id),
(cls.model.status == StatusEnum.VALID.value)
)
if not kbs:
return
d = kbs[0].to_dict()
# d["embd_id"] = kbs[0].tenant.embd_id
return d
@classmethod
@DB.connection_context()
def update_parser_config(cls, id, config):
e, m = cls.get_by_id(id)
if not e:
raise LookupError(f"knowledgebase({id}) not found.")
def dfs_update(old, new):
for k, v in new.items():
if k not in old:
old[k] = v
continue
if isinstance(v, dict):
assert isinstance(old[k], dict)
dfs_update(old[k], v)
elif isinstance(v, list):
assert isinstance(old[k], list)
old[k] = list(set(old[k] + v))
else:
old[k] = v
dfs_update(m.parser_config, config)
cls.update_by_id(id, {"parser_config": m.parser_config})
@classmethod
@DB.connection_context()
def get_field_map(cls, ids):
conf = {}
for k in cls.get_by_ids(ids):
if k.parser_config and "field_map" in k.parser_config:
conf.update(k.parser_config["field_map"])
return conf
@classmethod
@DB.connection_context()
def get_by_name(cls, kb_name, tenant_id):
kb = cls.model.select().where(
(cls.model.name == kb_name)
& (cls.model.tenant_id == tenant_id)
& (cls.model.status == StatusEnum.VALID.value)
)
if kb:
return True, kb[0]
return False, None
@classmethod
@DB.connection_context()
def get_all_ids(cls):
return [m["id"] for m in cls.model.select(cls.model.id).dicts()]
@classmethod
@DB.connection_context()
def get_list(cls, joined_tenant_ids, user_id,
page_number, items_per_page, orderby, desc, id, name):
kbs = cls.model.select()
if id:
kbs = kbs.where(cls.model.id == id)
if name:
kbs = kbs.where(cls.model.name == name)
kbs = kbs.where(
((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
TenantPermission.TEAM.value)) | (
cls.model.tenant_id == user_id))
& (cls.model.status == StatusEnum.VALID.value)
)
if desc:
kbs = kbs.order_by(cls.model.getter_by(orderby).desc())
else:
kbs = kbs.order_by(cls.model.getter_by(orderby).asc())
kbs = kbs.paginate(page_number, items_per_page)
return list(kbs.dicts())
@classmethod
@DB.connection_context()
def accessible(cls, kb_id, user_id):
docs = cls.model.select(
cls.model.id).join(UserTenant, on=(UserTenant.tenant_id == Knowledgebase.tenant_id)
).where(cls.model.id == kb_id, UserTenant.user_id == user_id).paginate(0, 1)
docs = docs.dicts()
if not docs:
return False
return True
@classmethod
@DB.connection_context()
def get_kb_by_id(cls, kb_id, user_id):
kbs = cls.model.select().join(UserTenant, on=(UserTenant.tenant_id == Knowledgebase.tenant_id)
).where(cls.model.id == kb_id, UserTenant.user_id == user_id).paginate(0, 1)
kbs = kbs.dicts()
return list(kbs)
@classmethod
@DB.connection_context()
def get_kb_by_name(cls, kb_name, user_id):
kbs = cls.model.select().join(UserTenant, on=(UserTenant.tenant_id == Knowledgebase.tenant_id)
).where(cls.model.name == kb_name, UserTenant.user_id == user_id).paginate(0, 1)
kbs = kbs.dicts()
return list(kbs)
@classmethod
@DB.connection_context()
def accessible4deletion(cls, kb_id, user_id):
docs = cls.model.select(
cls.model.id).where(cls.model.id == kb_id, cls.model.created_by == user_id).paginate(0, 1)
docs = docs.dicts()
if not docs:
return False
return True