mirror of
https://gitee.com/huangge1199_admin/vue-pro.git
synced 2024-11-26 17:21:53 +08:00
【解决todo】AI 知识库: 字段命名统一 补充注释
This commit is contained in:
parent
8e56b81a3a
commit
5cd870748d
@ -29,7 +29,7 @@ public class AiKnowledgeSegmentController {
|
||||
|
||||
@GetMapping("/page")
|
||||
@Operation(summary = "获取段落分页")
|
||||
public CommonResult<PageResult<AiKnowledgeSegmentRespVO>> getKnowledgeSegmentPageMy(@Valid AiKnowledgeSegmentPageReqVO pageReqVO) {
|
||||
public CommonResult<PageResult<AiKnowledgeSegmentRespVO>> getKnowledgeSegmentPage(@Valid AiKnowledgeSegmentPageReqVO pageReqVO) {
|
||||
PageResult<AiKnowledgeSegmentDO> pageResult = segmentService.getKnowledgeSegmentPage(pageReqVO);
|
||||
return success(BeanUtils.toBean(pageResult, AiKnowledgeSegmentRespVO.class));
|
||||
}
|
||||
|
@ -23,21 +23,21 @@ public class AiKnowledgeDocumentCreateReqVO {
|
||||
@URL(message = "文档 URL 格式不正确")
|
||||
private String url;
|
||||
|
||||
@Schema(description = "每个文本块的目标 token 数", requiredMode = Schema.RequiredMode.REQUIRED, example = "800")
|
||||
@NotNull(message = "每个文本块的目标 token 数不能为空")
|
||||
private Integer defaultChunkSize;
|
||||
@Schema(description = "每个段落的目标 token 数", requiredMode = Schema.RequiredMode.REQUIRED, example = "800")
|
||||
@NotNull(message = "每个段落的目标 token 数不能为空")
|
||||
private Integer defaultSegmentTokens;
|
||||
|
||||
@Schema(description = "每个文本块的最小字符数", requiredMode = Schema.RequiredMode.REQUIRED, example = "350")
|
||||
@NotNull(message = "每个文本块的最小字符数不能为空")
|
||||
private Integer minChunkSizeChars;
|
||||
@Schema(description = "每个段落的最小字符数", requiredMode = Schema.RequiredMode.REQUIRED, example = "350")
|
||||
@NotNull(message = "每个段落的最小字符数不能为空")
|
||||
private Integer minSegmentWordCount;
|
||||
|
||||
@Schema(description = "丢弃阈值", requiredMode = Schema.RequiredMode.REQUIRED, example = "5")
|
||||
@Schema(description = "丢弃阈值:低于此阈值的段落会被丢弃", requiredMode = Schema.RequiredMode.REQUIRED, example = "5")
|
||||
@NotNull(message = "丢弃阈值不能为空")
|
||||
private Integer minChunkLengthToEmbed;
|
||||
|
||||
@Schema(description = "最大块数", requiredMode = Schema.RequiredMode.REQUIRED, example = "10000")
|
||||
@NotNull(message = "最大块数不能为空")
|
||||
private Integer maxNumChunks;
|
||||
@Schema(description = "最大段落数", requiredMode = Schema.RequiredMode.REQUIRED, example = "10000")
|
||||
@NotNull(message = "最大段落数不能为空")
|
||||
private Integer maxNumSegments;
|
||||
|
||||
@Schema(description = "分块是否保留分隔符", requiredMode = Schema.RequiredMode.REQUIRED, example = "true")
|
||||
@NotNull(message = "分块是否保留分隔符不能为空")
|
||||
|
@ -38,9 +38,11 @@ public class AiKnowledgeDO extends BaseDO {
|
||||
* 知识库描述
|
||||
*/
|
||||
private String description;
|
||||
// TODO @新:如果全部可见,需要怎么设置?
|
||||
|
||||
/**
|
||||
* 可见权限,只能选择哪些人可见
|
||||
* 可见权限,选择哪些人可见
|
||||
* <p>
|
||||
* -1 所有人可见,其他为各自用户编号
|
||||
*/
|
||||
@TableField(typeHandler = JacksonTypeHandler.class)
|
||||
private List<Long> visibilityPermissions;
|
||||
|
@ -40,23 +40,25 @@ public class AiKnowledgeDocumentDO extends BaseDO {
|
||||
*/
|
||||
private String url;
|
||||
/**
|
||||
* token 数量
|
||||
* 文档 token 数量
|
||||
*/
|
||||
private Integer tokens;
|
||||
/**
|
||||
* 字符数
|
||||
* 文档字符数
|
||||
*/
|
||||
private Integer wordCount;
|
||||
// TODO @新:chunk 1)是不是 segment,这样命名保持一致会好点哈?2)Size 是不是改成 Tokens 会统一点;3)defaultChunkSize、minChunkSizeChars、maxNumChunks 这几个字段的命名,可能要微信一起讨论下。尽量命名保持风格统一哈。
|
||||
|
||||
|
||||
// ========== 自定义分段所用参数 ==========
|
||||
// TODO @新:3)defaultChunkSize、minChunkSizeChars、maxNumChunks 这几个字段的命名,可能要微信一起讨论下。尽量命名保持风格统一哈。
|
||||
/**
|
||||
* 每个文本块的目标 token 数
|
||||
*/
|
||||
private Integer defaultChunkSize;
|
||||
// TODO @xin:SizeChars 和 wordCount 好像是一个意思,是不是也要统一哈。
|
||||
private Integer defaultSegmentTokens;
|
||||
/**
|
||||
* 每个文本块的最小字符数
|
||||
*/
|
||||
private Integer minChunkSizeChars;
|
||||
private Integer minSegmentWordCount;
|
||||
/**
|
||||
* 低于此值的块会被丢弃
|
||||
*/
|
||||
@ -64,11 +66,13 @@ public class AiKnowledgeDocumentDO extends BaseDO {
|
||||
/**
|
||||
* 最大块数
|
||||
*/
|
||||
private Integer maxNumChunks;
|
||||
private Integer maxNumSegments;
|
||||
/**
|
||||
* 分块是否保留分隔符
|
||||
*/
|
||||
private Boolean keepSeparator;
|
||||
// ===================================
|
||||
|
||||
/**
|
||||
* 切片状态
|
||||
* <p>
|
||||
|
@ -2,8 +2,6 @@ package cn.iocoder.yudao.module.ai.dal.dataobject.knowledge;
|
||||
|
||||
import cn.iocoder.yudao.framework.common.enums.CommonStatusEnum;
|
||||
import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
|
||||
import com.baomidou.mybatisplus.annotation.FieldStrategy;
|
||||
import com.baomidou.mybatisplus.annotation.TableField;
|
||||
import com.baomidou.mybatisplus.annotation.TableId;
|
||||
import com.baomidou.mybatisplus.annotation.TableName;
|
||||
import lombok.Data;
|
||||
@ -27,7 +25,6 @@ public class AiKnowledgeSegmentDO extends BaseDO {
|
||||
/**
|
||||
* 向量库的编号
|
||||
*/
|
||||
@TableField(updateStrategy = FieldStrategy.ALWAYS) // TODO @新:尽量规避要这个注解。万一后面加个 status 单独更新,可能会踩坑。
|
||||
private String vectorId;
|
||||
/**
|
||||
* 知识库编号
|
||||
|
@ -25,8 +25,7 @@ public interface AiKnowledgeSegmentMapper extends BaseMapperX<AiKnowledgeSegment
|
||||
.orderByDesc(AiKnowledgeSegmentDO::getId));
|
||||
}
|
||||
|
||||
// TODO @新:selectListByXXX 哈
|
||||
default List<AiKnowledgeSegmentDO> selectList(List<String> vectorIdList) {
|
||||
default List<AiKnowledgeSegmentDO> selectListByVectorIds(List<String> vectorIdList) {
|
||||
return selectList(new LambdaQueryWrapperX<AiKnowledgeSegmentDO>()
|
||||
.in(AiKnowledgeSegmentDO::getVectorId, vectorIdList)
|
||||
.orderByDesc(AiKnowledgeSegmentDO::getId));
|
||||
|
@ -71,8 +71,8 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
|
||||
}
|
||||
|
||||
// 2 构造文本分段器
|
||||
TokenTextSplitter tokenTextSplitter = new TokenTextSplitter(createReqVO.getDefaultChunkSize(), createReqVO.getMinChunkSizeChars(), createReqVO.getMinChunkLengthToEmbed(),
|
||||
createReqVO.getMaxNumChunks(), createReqVO.getKeepSeparator());
|
||||
TokenTextSplitter tokenTextSplitter = new TokenTextSplitter(createReqVO.getDefaultSegmentTokens(), createReqVO.getMinSegmentWordCount(), createReqVO.getMinChunkLengthToEmbed(),
|
||||
createReqVO.getMaxNumSegments(), createReqVO.getKeepSeparator());
|
||||
// 2.1 文档分段
|
||||
List<Document> segments = tokenTextSplitter.apply(documents);
|
||||
// 2.2 分段内容入库
|
||||
|
@ -90,7 +90,7 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
|
||||
} else {
|
||||
// 2.2 禁用删除向量
|
||||
vectorStore.delete(List.of(oldKnowledgeSegment.getVectorId()));
|
||||
knowledgeSegment.setVectorId(null);
|
||||
knowledgeSegment.setVectorId("");
|
||||
}
|
||||
// 3 更新段落状态
|
||||
segmentMapper.updateById(knowledgeSegment);
|
||||
@ -114,7 +114,7 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
|
||||
return ListUtil.empty();
|
||||
}
|
||||
// 3.2 段落召回
|
||||
return segmentMapper.selectList(CollUtil.getFieldValues(documentList, "id", String.class));
|
||||
return segmentMapper.selectListByVectorIds(CollUtil.getFieldValues(documentList, "id", String.class));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -47,13 +47,12 @@ public interface AiKnowledgeService {
|
||||
*/
|
||||
PageResult<AiKnowledgeDO> getKnowledgePageMy(Long userId, PageParam pageReqVO);
|
||||
|
||||
// TODO @新:knowledgeId 和 validateKnowledgeExists 的 id 是同一个么?如果是的话,建议变量也用 id 哈,然后两边的 id 注释,保持一致
|
||||
/**
|
||||
* 根据知识库编号获取向量存储实例
|
||||
*
|
||||
* @param knowledgeId 知识库编号
|
||||
* @param id 知识库编号
|
||||
* @return 向量存储实例
|
||||
*/
|
||||
VectorStore getVectorStoreById(Long knowledgeId);
|
||||
VectorStore getVectorStoreById(Long id);
|
||||
|
||||
}
|
||||
|
@ -29,21 +29,18 @@ import static cn.iocoder.yudao.module.ai.enums.ErrorCodeConstants.KNOWLEDGE_NOT_
|
||||
@Slf4j
|
||||
public class AiKnowledgeServiceImpl implements AiKnowledgeService {
|
||||
|
||||
@Resource
|
||||
private AiChatModelService chatModalService;
|
||||
|
||||
@Resource
|
||||
private AiKnowledgeMapper knowledgeMapper;
|
||||
|
||||
@Resource
|
||||
private AiChatModelService chatModelService;
|
||||
@Resource
|
||||
private AiApiKeyService apiKeyService;
|
||||
// TODO @新:chatModelService 和 apiKeyService 可以放到 33 行的 chatModalService 后面。尽量保持,相同类型的变量在一块。例如说,Service 一块,Mapper 一块。
|
||||
|
||||
@Override
|
||||
public Long createKnowledgeMy(AiKnowledgeCreateMyReqVO createReqVO, Long userId) {
|
||||
// 1. 校验模型配置
|
||||
AiChatModelDO model = chatModalService.validateChatModel(createReqVO.getModelId());
|
||||
AiChatModelDO model = chatModelService.validateChatModel(createReqVO.getModelId());
|
||||
|
||||
// 2. 插入知识库
|
||||
AiKnowledgeDO knowledgeBase = BeanUtils.toBean(createReqVO, AiKnowledgeDO.class)
|
||||
@ -60,7 +57,7 @@ public class AiKnowledgeServiceImpl implements AiKnowledgeService {
|
||||
throw exception(KNOWLEDGE_NOT_EXISTS);
|
||||
}
|
||||
// 1.2 校验模型配置
|
||||
AiChatModelDO model = chatModalService.validateChatModel(updateReqVO.getModelId());
|
||||
AiChatModelDO model = chatModelService.validateChatModel(updateReqVO.getModelId());
|
||||
|
||||
// 2. 更新知识库
|
||||
AiKnowledgeDO updateDO = BeanUtils.toBean(updateReqVO, AiKnowledgeDO.class);
|
||||
@ -83,8 +80,8 @@ public class AiKnowledgeServiceImpl implements AiKnowledgeService {
|
||||
}
|
||||
|
||||
@Override
|
||||
public VectorStore getVectorStoreById(Long knowledgeId) {
|
||||
AiKnowledgeDO knowledge = validateKnowledgeExists(knowledgeId);
|
||||
public VectorStore getVectorStoreById(Long id) {
|
||||
AiKnowledgeDO knowledge = validateKnowledgeExists(id);
|
||||
AiChatModelDO model = chatModelService.validateChatModel(knowledge.getModelId());
|
||||
// 创建或获取 VectorStore 对象
|
||||
return apiKeyService.getOrCreateVectorStore(model.getKeyId());
|
||||
|
@ -197,7 +197,6 @@ public class AiModelFactoryImpl implements AiModelFactory {
|
||||
});
|
||||
}
|
||||
|
||||
// TODO @新:貌似可以创建一个大的 VectorStore。然后搜的时候,通过 Filter.Expression 过滤对应的数据。
|
||||
@Override
|
||||
public VectorStore getOrCreateVectorStore(EmbeddingModel embeddingModel, AiPlatformEnum platform, String apiKey, String url) {
|
||||
String cacheKey = buildClientCacheKey(VectorStore.class, platform, apiKey, url);
|
||||
|
Loading…
Reference in New Issue
Block a user