【新增】AI 知识库: 配置自定义、段落启禁用

This commit is contained in:
xiaoxin 2024-09-05 17:11:56 +08:00
parent 92d32b652e
commit 9b8136ef30
9 changed files with 149 additions and 27 deletions

View File

@ -25,4 +25,11 @@ public class AiKnowledgeCreateMyReqVO {
@NotNull(message = "嵌入模型不能为空")
private Long modelId;
@Schema(description = "相似性阈值", requiredMode = Schema.RequiredMode.REQUIRED, example = "0.5")
@NotNull(message = "相似性阈值不能为空")
private Double similarityThreshold;
@Schema(description = "topK", requiredMode = Schema.RequiredMode.REQUIRED, example = "3")
@NotNull(message = "topK 不能为空")
private Integer topK;
}

View File

@ -23,4 +23,23 @@ public class AiKnowledgeDocumentCreateReqVO {
@URL(message = "文档 URL 格式不正确")
private String url;
@Schema(description = "每个文本块的目标 token 数", requiredMode = Schema.RequiredMode.REQUIRED, example = "800")
@NotNull(message = "每个文本块的目标 token 数不能为空")
private Integer defaultChunkSize;
@Schema(description = "每个文本块的最小字符数", requiredMode = Schema.RequiredMode.REQUIRED, example = "350")
@NotNull(message = "每个文本块的最小字符数不能为空")
private Integer minChunkSizeChars;
@Schema(description = "丢弃阈值", requiredMode = Schema.RequiredMode.REQUIRED, example = "5")
@NotNull(message = "丢弃阈值不能为空")
private Integer minChunkLengthToEmbed;
@Schema(description = "最大块数", requiredMode = Schema.RequiredMode.REQUIRED, example = "10000")
@NotNull(message = "最大块数不能为空")
private Integer maxNumChunks;
@Schema(description = "分块是否保留分隔符", requiredMode = Schema.RequiredMode.REQUIRED, example = "true")
@NotNull(message = "分块是否保留分隔符不能为空")
private Boolean keepSeparator;
}

View File

@ -52,6 +52,18 @@ public class AiKnowledgeDO extends BaseDO {
* 模型标识
*/
private String model;
/**
* topK
*/
private Integer topK;
/**
* 相似度阈值
*/
private Double similarityThreshold;
/**
* 状态
* <p>

View File

@ -23,7 +23,7 @@ public class AiKnowledgeDocumentDO extends BaseDO {
private Long id;
/**
* 知识库编号
*
* <p>
* 关联 {@link AiKnowledgeDO#getId()}
*/
private Long knowledgeId;
@ -47,6 +47,26 @@ public class AiKnowledgeDocumentDO extends BaseDO {
* 字符数
*/
private Integer wordCount;
/**
* 每个文本块的目标 token
*/
private Integer defaultChunkSize;
/**
* 每个文本块的最小字符数
*/
private Integer minChunkSizeChars;
/**
* 低于此值的块会被丢弃
*/
private Integer minChunkLengthToEmbed;
/**
* 最大块数
*/
private Integer maxNumChunks;
/**
* 分块是否保留分隔符
*/
private Boolean keepSeparator;
/**
* 切片状态
* <p>

View File

@ -2,6 +2,8 @@ package cn.iocoder.yudao.module.ai.dal.dataobject.knowledge;
import cn.iocoder.yudao.framework.common.enums.CommonStatusEnum;
import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO;
import com.baomidou.mybatisplus.annotation.FieldStrategy;
import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
@ -25,16 +27,17 @@ public class AiKnowledgeSegmentDO extends BaseDO {
/**
* 向量库的编号
*/
@TableField(updateStrategy = FieldStrategy.ALWAYS)
private String vectorId;
/**
* 知识库编号
*
* <p>
* 关联 {@link AiKnowledgeDO#getId()}
*/
private Long knowledgeId;
/**
* 文档编号
*
* <p>
* 关联 {@link AiKnowledgeDocumentDO#getId()}
*/
private Long documentId;
@ -52,7 +55,7 @@ public class AiKnowledgeSegmentDO extends BaseDO {
private Integer tokens;
/**
* 状态
*
* <p>
* 枚举 {@link CommonStatusEnum}
*/
private Integer status;

View File

@ -9,15 +9,11 @@ import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.document.AiKnowledgeDocumentPageReqVO;
import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.document.AiKnowledgeDocumentUpdateReqVO;
import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnowledgeDocumentCreateReqVO;
import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDO;
import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDocumentDO;
import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeSegmentDO;
import cn.iocoder.yudao.module.ai.dal.dataobject.model.AiChatModelDO;
import cn.iocoder.yudao.module.ai.dal.mysql.knowledge.AiKnowledgeDocumentMapper;
import cn.iocoder.yudao.module.ai.dal.mysql.knowledge.AiKnowledgeSegmentMapper;
import cn.iocoder.yudao.module.ai.enums.knowledge.AiKnowledgeDocumentStatusEnum;
import cn.iocoder.yudao.module.ai.service.model.AiApiKeyService;
import cn.iocoder.yudao.module.ai.service.model.AiChatModelService;
import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.document.Document;
@ -48,24 +44,16 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
@Resource
private AiKnowledgeSegmentMapper segmentMapper;
@Resource
private TokenTextSplitter tokenTextSplitter;
@Resource
private TokenCountEstimator tokenCountEstimator;
@Resource
private AiApiKeyService apiKeyService;
@Resource
private AiKnowledgeService knowledgeService;
@Resource
private AiChatModelService chatModelService;
@Override
@Transactional(rollbackFor = Exception.class)
public Long createKnowledgeDocument(AiKnowledgeDocumentCreateReqVO createReqVO) {
// 0. 校验
AiKnowledgeDO knowledge = knowledgeService.validateKnowledgeExists(createReqVO.getKnowledgeId());
AiChatModelDO model = chatModelService.validateChatModel(knowledge.getModelId());
// 0. 校验并获取向量存储实例
VectorStore vectorStore = knowledgeService.getVectorStoreById(createReqVO.getKnowledgeId());
// 1.1 下载文档
TikaDocumentReader loader = new TikaDocumentReader(downloadFile(createReqVO.getUrl()));
@ -82,6 +70,9 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
return documentId;
}
// 2 构造文本分段器
TokenTextSplitter tokenTextSplitter = new TokenTextSplitter(createReqVO.getDefaultChunkSize(), createReqVO.getMinChunkSizeChars(), createReqVO.getMinChunkLengthToEmbed(),
createReqVO.getMaxNumChunks(), createReqVO.getKeepSeparator());
// 2.1 文档分段
List<Document> segments = tokenTextSplitter.apply(documents);
// 2.2 分段内容入库
@ -92,8 +83,6 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
.setStatus(CommonStatusEnum.ENABLE.getStatus()));
segmentMapper.insertBatch(segmentDOList);
// 3.1 获取向量存储实例
VectorStore vectorStore = apiKeyService.getOrCreateVectorStore(model.getKeyId());
// 3.2 向量化并存储
segments.forEach(segment -> segment.getMetadata().put(AiKnowledgeSegmentDO.FIELD_KNOWLEDGE_ID, createReqVO.getKnowledgeId()));
vectorStore.add(segments);

View File

@ -2,6 +2,7 @@ package cn.iocoder.yudao.module.ai.service.knowledge;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.collection.ListUtil;
import cn.iocoder.yudao.framework.common.enums.CommonStatusEnum;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.segment.AiKnowledgeSegmentPageReqVO;
@ -23,6 +24,10 @@ import org.springframework.ai.vectorstore.filter.FilterExpressionBuilder;
import org.springframework.stereotype.Service;
import java.util.List;
import java.util.Objects;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
import static cn.iocoder.yudao.module.ai.enums.ErrorCodeConstants.KNOWLEDGE_SEGMENT_NOT_EXISTS;
/**
* AI 知识库分片 Service 实现类
@ -50,14 +55,45 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
/**
 * Updates a knowledge segment and keeps its vector-store entry in sync with the new content.
 *
 * <p>The segment is validated before anything is written. Any vector currently linked to the
 * segment is removed, and a new one is created only while the segment is enabled, so that
 * editing a disabled segment does not make it searchable again.
 *
 * @param reqVO update request; {@code id} selects the segment and {@code content} is the new text
 */
@Override
public void updateKnowledgeSegment(AiKnowledgeSegmentUpdateReqVO reqVO) {
    // 0. Validate first: nothing may be written for an unknown segment, and the stored
    //    vector id must be read before any update touches the row (vectorId is mapped with
    //    FieldStrategy.ALWAYS, so an update without it would overwrite it with null).
    AiKnowledgeSegmentDO oldKnowledgeSegment = validateKnowledgeSegmentExists(reqVO.getId());

    // 1.1 Resolve the vector store of the owning knowledge base
    VectorStore vectorStore = knowledgeService.getVectorStoreById(oldKnowledgeSegment.getKnowledgeId());
    // 1.2 Remove the stale vector, if there is one. A segment without a vector (e.g. a disabled
    //     one) has a null vectorId, and List.of(null) would throw a NullPointerException.
    if (oldKnowledgeSegment.getVectorId() != null) {
        vectorStore.delete(List.of(oldKnowledgeSegment.getVectorId()));
    }

    // 2. Re-embed the new content, but only for enabled segments
    // NOTE(review): assumes the update request does not change the status itself — confirm against the VO.
    AiKnowledgeSegmentDO knowledgeSegment = BeanUtils.toBean(reqVO, AiKnowledgeSegmentDO.class);
    if (Objects.equals(oldKnowledgeSegment.getStatus(), CommonStatusEnum.ENABLE.getStatus())) {
        Document document = new Document(reqVO.getContent());
        document.getMetadata().put(AiKnowledgeSegmentDO.FIELD_KNOWLEDGE_ID, oldKnowledgeSegment.getKnowledgeId());
        vectorStore.add(List.of(document));
        knowledgeSegment.setVectorId(document.getId());
    } else {
        knowledgeSegment.setVectorId(null);
    }

    // 3. Persist the new content together with the current vector id in a single update
    segmentMapper.updateById(knowledgeSegment);
}
/**
 * Enables or disables a knowledge segment and keeps the vector store in sync.
 *
 * <p>Enabling embeds the stored content and records the new vector id; disabling removes the
 * vector and clears the id. Any vector already linked to the segment is deleted first, so
 * repeating the same request neither leaves an orphaned vector behind nor fails on a segment
 * that has no vector.
 *
 * @param reqVO status update request; {@code id} selects the segment, {@code status} is the target status
 */
@Override
public void updateKnowledgeSegmentStatus(AiKnowledgeSegmentUpdateStatusReqVO reqVO) {
    // 0. Validate before writing anything, so the stored vector id is still intact when read
    AiKnowledgeSegmentDO oldKnowledgeSegment = validateKnowledgeSegmentExists(reqVO.getId());

    // 1. Resolve the vector store of the owning knowledge base
    VectorStore vectorStore = knowledgeService.getVectorStoreById(oldKnowledgeSegment.getKnowledgeId());
    AiKnowledgeSegmentDO knowledgeSegment = BeanUtils.toBean(reqVO, AiKnowledgeSegmentDO.class);

    // 2.1 Drop the currently linked vector, if any. List.of(null) throws, so a segment that
    //     is already disabled (null vectorId) must skip the delete.
    if (oldKnowledgeSegment.getVectorId() != null) {
        vectorStore.delete(List.of(oldKnowledgeSegment.getVectorId()));
    }

    if (Objects.equals(reqVO.getStatus(), CommonStatusEnum.ENABLE.getStatus())) {
        // 2.2 Enable: embed the stored content again and remember the new vector id
        Document document = new Document(oldKnowledgeSegment.getContent());
        document.getMetadata().put(AiKnowledgeSegmentDO.FIELD_KNOWLEDGE_ID, oldKnowledgeSegment.getKnowledgeId());
        vectorStore.add(List.of(document));
        knowledgeSegment.setVectorId(document.getId());
    } else {
        // 2.3 Disable: the segment no longer has a vector
        knowledgeSegment.setVectorId(null);
    }

    // 3. Persist the new status and vector id
    segmentMapper.updateById(knowledgeSegment);
}
@Override
@ -71,9 +107,8 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
// 1.2 向量检索
List<Document> documentList = vectorStore.similaritySearch(SearchRequest.query(reqVO.getContent())
//TODO @xin 配置提取
.withTopK(5)
.withSimilarityThreshold(0.5d)
.withTopK(knowledge.getTopK())
.withSimilarityThreshold(knowledge.getSimilarityThreshold())
.withFilterExpression(new FilterExpressionBuilder().eq(AiKnowledgeSegmentDO.FIELD_KNOWLEDGE_ID, reqVO.getKnowledgeId()).build()));
if (CollUtil.isEmpty(documentList)) {
return ListUtil.empty();
@ -81,4 +116,19 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
// 2.1 段落召回
return segmentMapper.selectList(CollUtil.getFieldValues(documentList, "id", String.class));
}
/**
 * Loads a knowledge segment by id and fails when it cannot be found.
 *
 * @param id segment id
 * @return the stored segment; never {@code null}, because a missing row raises the
 *         exception built from {@code KNOWLEDGE_SEGMENT_NOT_EXISTS}
 */
private AiKnowledgeSegmentDO validateKnowledgeSegmentExists(Long id) {
    AiKnowledgeSegmentDO segment = segmentMapper.selectById(id);
    if (segment != null) {
        return segment;
    }
    throw exception(KNOWLEDGE_SEGMENT_NOT_EXISTS);
}
}

View File

@ -5,6 +5,7 @@ import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnowledgeCreateMyReqVO;
import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnowledgeUpdateMyReqVO;
import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDO;
import org.springframework.ai.vectorstore.VectorStore;
/**
* AI 知识库-基础信息 Service 接口
@ -47,4 +48,12 @@ public interface AiKnowledgeService {
* @return 知识库分页
*/
PageResult<AiKnowledgeDO> getKnowledgePageMy(Long userId, PageParam pageReqVO);
/**
 * Gets the vector store instance for the given knowledge base id.
 *
 * @param knowledgeId knowledge base id
 * @return the vector store instance
 */
VectorStore getVectorStoreById(Long knowledgeId);
}

View File

@ -10,9 +10,11 @@ import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnow
import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDO;
import cn.iocoder.yudao.module.ai.dal.dataobject.model.AiChatModelDO;
import cn.iocoder.yudao.module.ai.dal.mysql.knowledge.AiKnowledgeMapper;
import cn.iocoder.yudao.module.ai.service.model.AiApiKeyService;
import cn.iocoder.yudao.module.ai.service.model.AiChatModelService;
import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.stereotype.Service;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
@ -32,6 +34,10 @@ public class AiKnowledgeServiceImpl implements AiKnowledgeService {
@Resource
private AiKnowledgeMapper knowledgeMapper;
@Resource
private AiChatModelService chatModelService;
@Resource
private AiApiKeyService apiKeyService;
@Override
public Long createKnowledgeMy(AiKnowledgeCreateMyReqVO createReqVO, Long userId) {
@ -75,4 +81,11 @@ public class AiKnowledgeServiceImpl implements AiKnowledgeService {
return knowledgeMapper.selectPageByMy(userId, pageReqVO);
}
/**
 * Resolves the vector store for a knowledge base by following
 * knowledge base -> model -> API key.
 *
 * @param knowledgeId knowledge base id
 * @return the vector store obtained from the API key service for the model's key
 */
@Override
public VectorStore getVectorStoreById(Long knowledgeId) {
    // Both lookups validate their target before the key id is read
    AiChatModelDO chatModel = chatModelService.validateChatModel(
            validateKnowledgeExists(knowledgeId).getModelId());
    return apiKeyService.getOrCreateVectorStore(chatModel.getKeyId());
}
}