【解决todo】AI 知识库: 字段命名统一 补充注释

This commit is contained in:
xiaoxin 2024-09-22 15:20:55 +08:00
parent 8e56b81a3a
commit 5cd870748d
11 changed files with 38 additions and 41 deletions

View File

@ -29,7 +29,7 @@ public class AiKnowledgeSegmentController {
@GetMapping("/page") @GetMapping("/page")
@Operation(summary = "获取段落分页") @Operation(summary = "获取段落分页")
public CommonResult<PageResult<AiKnowledgeSegmentRespVO>> getKnowledgeSegmentPageMy(@Valid AiKnowledgeSegmentPageReqVO pageReqVO) { public CommonResult<PageResult<AiKnowledgeSegmentRespVO>> getKnowledgeSegmentPage(@Valid AiKnowledgeSegmentPageReqVO pageReqVO) {
PageResult<AiKnowledgeSegmentDO> pageResult = segmentService.getKnowledgeSegmentPage(pageReqVO); PageResult<AiKnowledgeSegmentDO> pageResult = segmentService.getKnowledgeSegmentPage(pageReqVO);
return success(BeanUtils.toBean(pageResult, AiKnowledgeSegmentRespVO.class)); return success(BeanUtils.toBean(pageResult, AiKnowledgeSegmentRespVO.class));
} }

View File

@ -23,21 +23,21 @@ public class AiKnowledgeDocumentCreateReqVO {
@URL(message = "文档 URL 格式不正确") @URL(message = "文档 URL 格式不正确")
private String url; private String url;
@Schema(description = "每个文本块的目标 token 数", requiredMode = Schema.RequiredMode.REQUIRED, example = "800") @Schema(description = "每个段落的目标 token 数", requiredMode = Schema.RequiredMode.REQUIRED, example = "800")
@NotNull(message = "每个文本块的目标 token 数不能为空") @NotNull(message = "每个段落的目标 token 数不能为空")
private Integer defaultChunkSize; private Integer defaultSegmentTokens;
@Schema(description = "每个文本块的最小字符数", requiredMode = Schema.RequiredMode.REQUIRED, example = "350") @Schema(description = "每个段落的最小字符数", requiredMode = Schema.RequiredMode.REQUIRED, example = "350")
@NotNull(message = "每个文本块的最小字符数不能为空") @NotNull(message = "每个段落的最小字符数不能为空")
private Integer minChunkSizeChars; private Integer minSegmentWordCount;
@Schema(description = "丢弃阈值", requiredMode = Schema.RequiredMode.REQUIRED, example = "5") @Schema(description = "丢弃阈值:低于此阈值的段落会被丢弃", requiredMode = Schema.RequiredMode.REQUIRED, example = "5")
@NotNull(message = "丢弃阈值不能为空") @NotNull(message = "丢弃阈值不能为空")
private Integer minChunkLengthToEmbed; private Integer minChunkLengthToEmbed;
@Schema(description = "最大", requiredMode = Schema.RequiredMode.REQUIRED, example = "10000") @Schema(description = "最大段落", requiredMode = Schema.RequiredMode.REQUIRED, example = "10000")
@NotNull(message = "最大数不能为空") @NotNull(message = "最大段落数不能为空")
private Integer maxNumChunks; private Integer maxNumSegments;
@Schema(description = "分块是否保留分隔符", requiredMode = Schema.RequiredMode.REQUIRED, example = "true") @Schema(description = "分块是否保留分隔符", requiredMode = Schema.RequiredMode.REQUIRED, example = "true")
@NotNull(message = "分块是否保留分隔符不能为空") @NotNull(message = "分块是否保留分隔符不能为空")

View File

@ -38,9 +38,11 @@ public class AiKnowledgeDO extends BaseDO {
* 知识库描述 * 知识库描述
*/ */
private String description; private String description;
// TODO @新:如果全部可见,需要怎么设置?
/** /**
* 可见权限,只能选择哪些人可见 * 可见权限,选择哪些人可见
* <p>
* -1 表示所有人可见,其他为各自用户编号
*/ */
@TableField(typeHandler = JacksonTypeHandler.class) @TableField(typeHandler = JacksonTypeHandler.class)
private List<Long> visibilityPermissions; private List<Long> visibilityPermissions;

View File

@ -40,23 +40,25 @@ public class AiKnowledgeDocumentDO extends BaseDO {
*/ */
private String url; private String url;
/** /**
* token 数量 * 文档 token 数量
*/ */
private Integer tokens; private Integer tokens;
/** /**
* 字符数 * 文档字符数
*/ */
private Integer wordCount; private Integer wordCount;
// TODO @新:chunk 1)是不是 segment,这样命名保持一致会好点哈?2)Size 是不是改成 Tokens 会统一点;3)defaultChunkSize、defaultChunkSize、minChunkSizeChars、maxNumChunks 这几个字段的命名,可能要微信一起讨论下,尽量命名保持风格统一哈
// ========== 自定义分段所用参数 ==========
// TODO @新:3)defaultChunkSize、defaultChunkSize、minChunkSizeChars、maxNumChunks 这几个字段的命名,可能要微信一起讨论下,尽量命名保持风格统一哈
/** /**
* 每个文本块的目标 token * 每个文本块的目标 token
*/ */
private Integer defaultChunkSize; private Integer defaultSegmentTokens;
// TODO @xin:SizeChars 和 wordCount 好像是一个意思,是不是也要统一哈?
/** /**
* 每个文本块的最小字符数 * 每个文本块的最小字符数
*/ */
private Integer minChunkSizeChars; private Integer minSegmentWordCount;
/** /**
* 低于此值的块会被丢弃 * 低于此值的块会被丢弃
*/ */
@ -64,11 +66,13 @@ public class AiKnowledgeDocumentDO extends BaseDO {
/** /**
* 最大块数 * 最大块数
*/ */
private Integer maxNumChunks; private Integer maxNumSegments;
/** /**
* 分块是否保留分隔符 * 分块是否保留分隔符
*/ */
private Boolean keepSeparator; private Boolean keepSeparator;
// ===================================
/** /**
* 切片状态 * 切片状态
* <p> * <p>

View File

@ -2,8 +2,6 @@ package cn.iocoder.yudao.module.ai.dal.dataobject.knowledge;
import cn.iocoder.yudao.framework.common.enums.CommonStatusEnum; import cn.iocoder.yudao.framework.common.enums.CommonStatusEnum;
import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO; import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
import com.baomidou.mybatisplus.annotation.FieldStrategy;
import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableId; import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName; import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data; import lombok.Data;
@ -27,7 +25,6 @@ public class AiKnowledgeSegmentDO extends BaseDO {
/** /**
* 向量库的编号 * 向量库的编号
*/ */
@TableField(updateStrategy = FieldStrategy.ALWAYS) // TODO @新:尽量规避要这个注解,万一后面加个 status 单独更新,可能会踩坑
private String vectorId; private String vectorId;
/** /**
* 知识库编号 * 知识库编号

View File

@ -25,8 +25,7 @@ public interface AiKnowledgeSegmentMapper extends BaseMapperX<AiKnowledgeSegment
.orderByDesc(AiKnowledgeSegmentDO::getId)); .orderByDesc(AiKnowledgeSegmentDO::getId));
} }
// TODO @新:selectListByXXX default List<AiKnowledgeSegmentDO> selectListByVectorIds(List<String> vectorIdList) {
default List<AiKnowledgeSegmentDO> selectList(List<String> vectorIdList) {
return selectList(new LambdaQueryWrapperX<AiKnowledgeSegmentDO>() return selectList(new LambdaQueryWrapperX<AiKnowledgeSegmentDO>()
.in(AiKnowledgeSegmentDO::getVectorId, vectorIdList) .in(AiKnowledgeSegmentDO::getVectorId, vectorIdList)
.orderByDesc(AiKnowledgeSegmentDO::getId)); .orderByDesc(AiKnowledgeSegmentDO::getId));

View File

@ -71,8 +71,8 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
} }
// 2 构造文本分段器 // 2 构造文本分段器
TokenTextSplitter tokenTextSplitter = new TokenTextSplitter(createReqVO.getDefaultChunkSize(), createReqVO.getMinChunkSizeChars(), createReqVO.getMinChunkLengthToEmbed(), TokenTextSplitter tokenTextSplitter = new TokenTextSplitter(createReqVO.getDefaultSegmentTokens(), createReqVO.getMinSegmentWordCount(), createReqVO.getMinChunkLengthToEmbed(),
createReqVO.getMaxNumChunks(), createReqVO.getKeepSeparator()); createReqVO.getMaxNumSegments(), createReqVO.getKeepSeparator());
// 2.1 文档分段 // 2.1 文档分段
List<Document> segments = tokenTextSplitter.apply(documents); List<Document> segments = tokenTextSplitter.apply(documents);
// 2.2 分段内容入库 // 2.2 分段内容入库

View File

@ -90,7 +90,7 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
} else { } else {
// 2.2 禁用删除向量 // 2.2 禁用删除向量
vectorStore.delete(List.of(oldKnowledgeSegment.getVectorId())); vectorStore.delete(List.of(oldKnowledgeSegment.getVectorId()));
knowledgeSegment.setVectorId(null); knowledgeSegment.setVectorId("");
} }
// 3 更新段落状态 // 3 更新段落状态
segmentMapper.updateById(knowledgeSegment); segmentMapper.updateById(knowledgeSegment);
@ -114,7 +114,7 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
return ListUtil.empty(); return ListUtil.empty();
} }
// 3.2 段落召回 // 3.2 段落召回
return segmentMapper.selectList(CollUtil.getFieldValues(documentList, "id", String.class)); return segmentMapper.selectListByVectorIds(CollUtil.getFieldValues(documentList, "id", String.class));
} }
/** /**

View File

@ -47,13 +47,12 @@ public interface AiKnowledgeService {
*/ */
PageResult<AiKnowledgeDO> getKnowledgePageMy(Long userId, PageParam pageReqVO); PageResult<AiKnowledgeDO> getKnowledgePageMy(Long userId, PageParam pageReqVO);
// TODO @新:knowledgeId 和 validateKnowledgeExists 的 id 是同一个么?如果是的话,建议变量也用 id,然后两边的 id 注释保持一致
/** /**
* 根据知识库编号获取向量存储实例 * 根据知识库编号获取向量存储实例
* *
* @param knowledgeId 知识库编号 * @param id 知识库编号
* @return 向量存储实例 * @return 向量存储实例
*/ */
VectorStore getVectorStoreById(Long knowledgeId); VectorStore getVectorStoreById(Long id);
} }

View File

@ -29,21 +29,18 @@ import static cn.iocoder.yudao.module.ai.enums.ErrorCodeConstants.KNOWLEDGE_NOT_
@Slf4j @Slf4j
public class AiKnowledgeServiceImpl implements AiKnowledgeService { public class AiKnowledgeServiceImpl implements AiKnowledgeService {
@Resource
private AiChatModelService chatModalService;
@Resource @Resource
private AiKnowledgeMapper knowledgeMapper; private AiKnowledgeMapper knowledgeMapper;
@Resource @Resource
private AiChatModelService chatModelService; private AiChatModelService chatModelService;
@Resource @Resource
private AiApiKeyService apiKeyService; private AiApiKeyService apiKeyService;
// TODO @新:chatModelService、apiKeyService 可以放到 33 行的 chatModalService 后面,尽量保持相同类型的变量在一块。例如说:Service 一块,Mapper 一块
@Override @Override
public Long createKnowledgeMy(AiKnowledgeCreateMyReqVO createReqVO, Long userId) { public Long createKnowledgeMy(AiKnowledgeCreateMyReqVO createReqVO, Long userId) {
// 1. 校验模型配置 // 1. 校验模型配置
AiChatModelDO model = chatModalService.validateChatModel(createReqVO.getModelId()); AiChatModelDO model = chatModelService.validateChatModel(createReqVO.getModelId());
// 2. 插入知识库 // 2. 插入知识库
AiKnowledgeDO knowledgeBase = BeanUtils.toBean(createReqVO, AiKnowledgeDO.class) AiKnowledgeDO knowledgeBase = BeanUtils.toBean(createReqVO, AiKnowledgeDO.class)
@ -60,7 +57,7 @@ public class AiKnowledgeServiceImpl implements AiKnowledgeService {
throw exception(KNOWLEDGE_NOT_EXISTS); throw exception(KNOWLEDGE_NOT_EXISTS);
} }
// 1.2 校验模型配置 // 1.2 校验模型配置
AiChatModelDO model = chatModalService.validateChatModel(updateReqVO.getModelId()); AiChatModelDO model = chatModelService.validateChatModel(updateReqVO.getModelId());
// 2. 更新知识库 // 2. 更新知识库
AiKnowledgeDO updateDO = BeanUtils.toBean(updateReqVO, AiKnowledgeDO.class); AiKnowledgeDO updateDO = BeanUtils.toBean(updateReqVO, AiKnowledgeDO.class);
@ -83,8 +80,8 @@ public class AiKnowledgeServiceImpl implements AiKnowledgeService {
} }
@Override @Override
public VectorStore getVectorStoreById(Long knowledgeId) { public VectorStore getVectorStoreById(Long id) {
AiKnowledgeDO knowledge = validateKnowledgeExists(knowledgeId); AiKnowledgeDO knowledge = validateKnowledgeExists(id);
AiChatModelDO model = chatModelService.validateChatModel(knowledge.getModelId()); AiChatModelDO model = chatModelService.validateChatModel(knowledge.getModelId());
// 创建或获取 VectorStore 对象 // 创建或获取 VectorStore 对象
return apiKeyService.getOrCreateVectorStore(model.getKeyId()); return apiKeyService.getOrCreateVectorStore(model.getKeyId());

View File

@ -197,7 +197,6 @@ public class AiModelFactoryImpl implements AiModelFactory {
}); });
} }
// TODO @新:貌似可以创建一个大的 VectorStore,然后搜的时候,通过 Filter.Expression 过滤对应的数据
@Override @Override
public VectorStore getOrCreateVectorStore(EmbeddingModel embeddingModel, AiPlatformEnum platform, String apiKey, String url) { public VectorStore getOrCreateVectorStore(EmbeddingModel embeddingModel, AiPlatformEnum platform, String apiKey, String url) {
String cacheKey = buildClientCacheKey(VectorStore.class, platform, apiKey, url); String cacheKey = buildClientCacheKey(VectorStore.class, platform, apiKey, url);