[Added] AI knowledge base: document token and character count calculation

xiaoxin 2024-08-15 16:50:16 +08:00
parent d3a4c3c718
commit 238f603f69
3 changed files with 21 additions and 13 deletions

View File

@@ -21,7 +21,7 @@ public interface AiEmbeddingService {
     /**
      * Similarity search
      *
-     * @param content the query content
+     * @param request the query entity
      */
     List<Document> similaritySearch(SearchRequest request);
 }
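The only change in this file is the Javadoc: the @param tag now names the actual SearchRequest parameter. For context, a minimal caller sketch of my own (not part of the commit; it assumes the pre-1.0 Spring AI fluent SearchRequest.query(...).withTopK(...) API and that the caller sits in the same package as AiEmbeddingService, and searchTop5 is a hypothetical helper):

    import org.springframework.ai.document.Document;
    import org.springframework.ai.vectorstore.SearchRequest;

    import java.util.List;

    public class AiEmbeddingSearchExample {

        private final AiEmbeddingService embeddingService; // the service whose Javadoc this hunk fixes

        public AiEmbeddingSearchExample(AiEmbeddingService embeddingService) {
            this.embeddingService = embeddingService;
        }

        public List<Document> searchTop5(String question) {
            // Build the "query entity" that the corrected @param now documents.
            SearchRequest request = SearchRequest.query(question).withTopK(5);
            return embeddingService.similaritySearch(request);
        }
    }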

View File

@@ -4,7 +4,6 @@ import jakarta.annotation.Resource;
 import org.springframework.ai.document.Document;
 import org.springframework.ai.vectorstore.RedisVectorStore;
 import org.springframework.ai.vectorstore.SearchRequest;
-import org.springframework.scheduling.annotation.Async;
 import org.springframework.stereotype.Service;

 import java.util.List;
@@ -21,6 +20,8 @@ public class AiEmbeddingServiceImpl implements AiEmbeddingService {
     private RedisVectorStore vectorStore;

     @Override
+    // @Async
+    // TODO xiaoxin: commented out for now because it throws an error
     public void add(List<Document> documents) {
         vectorStore.add(documents);
     }
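The @Async annotation on add(...) is kept only as a comment because it currently errors out (per the TODO above). If it is re-enabled later, Spring would typically also need an @EnableAsync configuration, usually with a dedicated task executor. A sketch of what that could look like (AiAsyncConfiguration, the bean name, and the pool sizes are my own illustration, not part of this commit):

    import org.springframework.context.annotation.Bean;
    import org.springframework.context.annotation.Configuration;
    import org.springframework.scheduling.annotation.EnableAsync;
    import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;

    import java.util.concurrent.Executor;

    @Configuration
    @EnableAsync
    public class AiAsyncConfiguration {

        @Bean
        public Executor aiEmbeddingExecutor() {
            // Small pool; embedding calls are mostly I/O bound.
            ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
            executor.setCorePoolSize(2);
            executor.setMaxPoolSize(4);
            executor.setThreadNamePrefix("ai-embedding-");
            executor.initialize();
            return executor;
        }
    }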

View File

@@ -14,12 +14,14 @@ import jakarta.annotation.Resource;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.ai.document.Document;
 import org.springframework.ai.reader.tika.TikaDocumentReader;
+import org.springframework.ai.tokenizer.JTokkitTokenCountEstimator;
 import org.springframework.ai.transformer.splitter.TokenTextSplitter;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Transactional;

 import java.util.List;
+import java.util.Objects;

 /**
  * AI knowledge base - document Service implementation class
@@ -41,7 +43,9 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
     @Resource
     private AiEmbeddingService embeddingService;

-    // TODO @xin: temporary for testing, remove later
+    private static final JTokkitTokenCountEstimator TOKEN_COUNT_ESTIMATOR = new JTokkitTokenCountEstimator();
+
+    // TODO xiaoxin: temporary for testing, remove later
     @Value("classpath:/webapp/test/Fel.pdf")
     private org.springframework.core.io.Resource data;
@@ -49,18 +53,23 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
     @Override
     @Transactional(rollbackFor = Exception.class)
     public Long createKnowledgeDocument(AiKnowledgeDocumentCreateReqVO createReqVO) {
-        AiKnowledgeDocumentDO documentDO = BeanUtils.toBean(createReqVO, AiKnowledgeDocumentDO.class);
-        documentDO
-                //todo
-                .setTokens(0).setWordCount(0)
-                .setStatus(CommonStatusEnum.ENABLE.getStatus()).setSliceStatus(AiKnowledgeDocumentStatusEnum.SUCCESS.getStatus());
-        documentMapper.insert(documentDO);
+        // TODO xiaoxin: load from the url later
         TikaDocumentReader loader = new TikaDocumentReader(data);
+        // Load the document
         List<Document> documents = loader.get();
+        Document document = CollUtil.getFirst(documents);
+        // TODO 芋艿: the document may be quite large; could these two fields be computed from the segment table instead?
+        Integer tokens = Objects.nonNull(document) ? TOKEN_COUNT_ESTIMATOR.estimate(document.getContent()) : 0;
+        Integer wordCount = Objects.nonNull(document) ? document.getContent().length() : 0;
+        AiKnowledgeDocumentDO documentDO = BeanUtils.toBean(createReqVO, AiKnowledgeDocumentDO.class);
+        documentDO.setTokens(tokens).setWordCount(wordCount)
+                .setStatus(CommonStatusEnum.ENABLE.getStatus()).setSliceStatus(AiKnowledgeDocumentStatusEnum.SUCCESS.getStatus());
+        // Persist the document record
+        documentMapper.insert(documentDO);
         Long documentId = documentDO.getId();
         if (CollUtil.isEmpty(documents)) {
             log.info("Document content is empty");
             return documentId;
         }
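With this hunk the document-level counts are derived from the first parsed Document: tokens via the JTokkit-backed JTokkitTokenCountEstimator and wordCount as the character length of the content, both falling back to 0 when parsing returns nothing. A standalone sketch of just that calculation (DocumentCountExample, DocumentCounts, and countsOf are illustrative names of my own; estimate(...) and getContent() are the calls used in the diff):

    import org.springframework.ai.document.Document;
    import org.springframework.ai.tokenizer.JTokkitTokenCountEstimator;

    public class DocumentCountExample {

        private static final JTokkitTokenCountEstimator TOKEN_COUNT_ESTIMATOR = new JTokkitTokenCountEstimator();

        record DocumentCounts(int tokens, int wordCount) { }

        static DocumentCounts countsOf(Document document) {
            if (document == null) {
                // Same fallback as the diff: no parsed document -> both counts are 0.
                return new DocumentCounts(0, 0);
            }
            String content = document.getContent();
            // tokens: BPE token estimate; wordCount: plain character length of the text.
            return new DocumentCounts(TOKEN_COUNT_ESTIMATOR.estimate(content), content.length());
        }
    }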
@@ -69,10 +78,8 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
         List<AiKnowledgeSegmentDO> segmentDOList = CollectionUtils.convertList(segments,
                 segment -> new AiKnowledgeSegmentDO().setContent(segment.getContent()).setDocumentId(documentId)
-                        //todo
-                        .setTokens(0).setWordCount(0)
+                        .setTokens(TOKEN_COUNT_ESTIMATOR.estimate(segment.getContent())).setWordCount(segment.getContent().length())
                         .setStatus(CommonStatusEnum.ENABLE.getStatus()));
         // Persist the segment contents
         segmentMapper.insertBatch(segmentDOList);
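The same estimator is now applied per segment, so each AiKnowledgeSegmentDO stores its own token and character counts instead of 0. A sketch of that segment-level flow, assuming the surrounding code splits with a default TokenTextSplitter as the existing import suggests (the splitter call itself is outside this hunk; splitAndCount and the printed output are illustrative only):

    import org.springframework.ai.document.Document;
    import org.springframework.ai.tokenizer.JTokkitTokenCountEstimator;
    import org.springframework.ai.transformer.splitter.TokenTextSplitter;

    import java.util.List;

    public class SegmentCountExample {

        private static final JTokkitTokenCountEstimator TOKEN_COUNT_ESTIMATOR = new JTokkitTokenCountEstimator();

        static void splitAndCount(List<Document> documents) {
            // Split the parsed documents into token-bounded segments.
            TokenTextSplitter splitter = new TokenTextSplitter();
            List<Document> segments = splitter.apply(documents);
            for (Document segment : segments) {
                String content = segment.getContent();
                // Same per-segment counts as the hunk above: token estimate plus character length.
                System.out.printf("segment tokens=%d, chars=%d%n",
                        TOKEN_COUNT_ESTIMATOR.estimate(content), content.length());
            }
        }
    }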