From e854a0288aee80dbcc329e883714e204da77110d Mon Sep 17 00:00:00 2001
From: Dallas98 <40557804+Dallas98@users.noreply.github.com>
Date: Fri, 31 Oct 2025 13:16:05 +0800
Subject: [PATCH] feat: update knowledge base processing to use KnowledgeBase
object and enhance configuration (#46)
* feat: update knowledge base processing to use KnowledgeBase object and enhance configuration
---
backend/pom.xml | 13 +++---
.../src/main/resources/application.yml | 12 +----
backend/services/rag-indexer-service/pom.xml | 15 +++----
.../rag/indexer/RagApplication.java | 17 -------
.../application/KnowledgeBaseService.java | 6 +--
.../event/DataInsertedEvent.java | 3 +-
.../infrastructure/event/RagEtlService.java | 44 +++++++++++--------
.../interfaces/EmbeddingController.java | 8 ----
.../interfaces/KnowledgeBaseController.java | 13 ++----
.../indexer/interfaces/dto/AddFilesReq.java | 2 +-
10 files changed, 50 insertions(+), 83 deletions(-)
delete mode 100644 backend/services/rag-indexer-service/src/main/java/com/dataengine/rag/indexer/RagApplication.java
delete mode 100644 backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/EmbeddingController.java
diff --git a/backend/pom.xml b/backend/pom.xml
index ed1252c..97739ea 100644
--- a/backend/pom.xml
+++ b/backend/pom.xml
@@ -75,6 +75,13 @@
pom
import
+
+ com.google.protobuf
+ protobuf-bom
+ 3.25.5
+ pom
+ import
+
org.springframework.cloud
spring-cloud-dependencies
@@ -165,12 +172,6 @@
-
- org.springframework.ai
- spring-ai-starter-mcp-server-webmvc
- ${spring-ai.version}
-
-
com.baomidou
mybatis-plus-spring-boot3-starter
diff --git a/backend/services/main-application/src/main/resources/application.yml b/backend/services/main-application/src/main/resources/application.yml
index 51206ba..6c66d63 100644
--- a/backend/services/main-application/src/main/resources/application.yml
+++ b/backend/services/main-application/src/main/resources/application.yml
@@ -167,13 +167,5 @@ datamate:
# RAG配置
rag:
- embedding:
- model: ${RAG_EMBEDDING_MODEL:text-embedding-ada-002}
- api-key: ${RAG_API_KEY:}
- dimension: ${RAG_DIMENSION:1536}
- chunk:
- size: ${RAG_CHUNK_SIZE:512}
- overlap: ${RAG_CHUNK_OVERLAP:50}
- retrieval:
- top-k: ${RAG_TOP_K:5}
- score-threshold: ${RAG_SCORE_THRESHOLD:0.7}
+ milvus-host: ${MILVUS_HOST:milvus-standalone}
+ milvus-port: ${MILVUS_PORT:19530}
diff --git a/backend/services/rag-indexer-service/pom.xml b/backend/services/rag-indexer-service/pom.xml
index 454a3d9..cef1760 100644
--- a/backend/services/rag-indexer-service/pom.xml
+++ b/backend/services/rag-indexer-service/pom.xml
@@ -98,15 +98,12 @@
dev.langchain4j
langchain4j-milvus
-
-
-
- dev.langchain4j
- langchain4j-embeddings-all-minilm-l6-v2
-
-
- org.testcontainers
- milvus
+
+
+ com.google.protobuf
+ protobuf-java
+
+
diff --git a/backend/services/rag-indexer-service/src/main/java/com/dataengine/rag/indexer/RagApplication.java b/backend/services/rag-indexer-service/src/main/java/com/dataengine/rag/indexer/RagApplication.java
deleted file mode 100644
index 4f5af38..0000000
--- a/backend/services/rag-indexer-service/src/main/java/com/dataengine/rag/indexer/RagApplication.java
+++ /dev/null
@@ -1,17 +0,0 @@
-package com.dataengine.rag.indexer;
-
-import org.springframework.boot.SpringApplication;
-import org.springframework.boot.autoconfigure.SpringBootApplication;
-
-/**
- *
- *
- * @author dallas
- * @since 2025-10-13
- */
-@SpringBootApplication
-public class RagApplication {
- public static void main(String[] args) {
- SpringApplication.run(RagApplication.class, args);
- }
-}
diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/application/KnowledgeBaseService.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/application/KnowledgeBaseService.java
index 989816d..65efb91 100644
--- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/application/KnowledgeBaseService.java
+++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/application/KnowledgeBaseService.java
@@ -93,13 +93,13 @@ public class KnowledgeBaseService {
List ragFiles = request.getFiles().stream().map(fileInfo -> {
RagFile ragFile = new RagFile();
ragFile.setKnowledgeBaseId(knowledgeBase.getId());
- ragFile.setFileId(fileInfo.fileId());
- ragFile.setFileName(fileInfo.fileName());
+ ragFile.setFileId(fileInfo.id());
+ ragFile.setFileName(fileInfo.name());
ragFile.setStatus(FileStatus.UNPROCESSED);
return ragFile;
}).toList();
ragFileRepository.saveBatch(ragFiles, 100);
- eventPublisher.publishEvent(new DataInsertedEvent(knowledgeBase.getId(), request.getProcessType()));
+ eventPublisher.publishEvent(new DataInsertedEvent(knowledgeBase, request.getProcessType()));
}
public PagedResponse listFiles(String knowledgeBaseId, RagFileReq request) {
diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/DataInsertedEvent.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/DataInsertedEvent.java
index 417de1e..af749b2 100644
--- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/DataInsertedEvent.java
+++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/DataInsertedEvent.java
@@ -1,5 +1,6 @@
package com.datamate.rag.indexer.infrastructure.event;
+import com.datamate.rag.indexer.domain.model.KnowledgeBase;
import com.datamate.rag.indexer.interfaces.dto.ProcessType;
/**
@@ -8,5 +9,5 @@ import com.datamate.rag.indexer.interfaces.dto.ProcessType;
* @author dallas
* @since 2025-10-29
*/
-public record DataInsertedEvent(String knowledgeBaseId, ProcessType processType) {
+public record DataInsertedEvent(KnowledgeBase knowledgeBase, ProcessType processType) {
}
diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java
index 5c9979e..75d4e5e 100644
--- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java
+++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java
@@ -23,12 +23,11 @@ import dev.langchain4j.data.document.transformer.jsoup.HtmlToTextDocumentTransfo
import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.embedding.EmbeddingModel;
-import dev.langchain4j.model.output.Response;
import dev.langchain4j.store.embedding.EmbeddingStore;
import dev.langchain4j.store.embedding.milvus.MilvusEmbeddingStore;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
-import org.jetbrains.annotations.NotNull;
+import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import org.springframework.transaction.event.TransactionPhase;
@@ -52,6 +51,11 @@ import java.util.concurrent.Semaphore;
public class RagEtlService {
private static final Semaphore SEMAPHORE = new Semaphore(10);
+ @Value("${datamate.rag.milvus-host}")
+ private String milvusHost;
+ @Value("${datamate.rag.milvus-port}")
+ private int milvusPort;
+
private final RagFileRepository ragFileRepository;
private final DatasetFileRepository datasetFileRepository;
@@ -64,7 +68,7 @@ public class RagEtlService {
@TransactionalEventListener(phase = TransactionPhase.AFTER_COMMIT)
public void processAfterCommit(DataInsertedEvent event) {
// 执行 RAG 处理流水线
- List ragFiles = ragFileRepository.findByKnowledgeBaseId(event.knowledgeBaseId());
+ List ragFiles = ragFileRepository.findByKnowledgeBaseId(event.knowledgeBase().getId());
ragFiles.forEach(ragFile -> {
try {
@@ -74,12 +78,13 @@ public class RagEtlService {
// 执行 RAG 处理流水线
ragFile.setStatus(FileStatus.PROCESSING);
ragFileRepository.updateById(ragFile);
- processRagFile(ragFile, event.processType());
+ processRagFile(ragFile, event);
// 更新文件状态为已处理
ragFile.setStatus(FileStatus.PROCESSED);
ragFileRepository.updateById(ragFile);
} catch (Exception e) {
// 处理异常
+ log.error("Error processing RAG file: {}", ragFile.getFileId(), e);
ragFile.setStatus(FileStatus.PROCESS_FAILED);
ragFileRepository.updateById(ragFile);
} finally {
@@ -93,7 +98,7 @@ public class RagEtlService {
);
}
- private void processRagFile(RagFile ragFile, ProcessType processType) {
+ private void processRagFile(RagFile ragFile, DataInsertedEvent event) {
DatasetFile file = datasetFileRepository.getById(ragFile.getFileId());
// 使用文档解析器解析文档
DocumentParser parser = documentParser(file.getFileType());
@@ -101,10 +106,10 @@ public class RagEtlService {
Document document = FileSystemDocumentLoader.loadDocument(file.getFilePath(), parser);
// 对html文档进行转换
if (Arrays.asList("html", "htm").contains(file.getFileType().toLowerCase())) {
- document= new HtmlToTextDocumentTransformer().transform(document);
+ document = new HtmlToTextDocumentTransformer().transform(document);
}
// 使用文档分块器对文档进行分块
- DocumentSplitter splitter = documentSplitter(processType);
+ DocumentSplitter splitter = documentSplitter(event.processType());
List split = splitter.split(document);
// 更新分块数量
@@ -112,12 +117,12 @@ public class RagEtlService {
ragFileRepository.updateById(ragFile);
// 调用模型客户端获取嵌入模型
- ModelConfig model = modelConfigRepository.getById("1");
+ ModelConfig model = modelConfigRepository.getById(event.knowledgeBase().getEmbeddingModel());
EmbeddingModel embeddingModel = ModelClient.invokeEmbeddingModel(model);
// 调用嵌入模型获取嵌入向量
- Response<@NotNull List> response = embeddingModel.embedAll(split);
+ List content = embeddingModel.embedAll(split).content();
// 存储嵌入向量到 Milvus
- embeddingStore().addAll(response.content(), split);
+ embeddingStore(embeddingModel, ragFile.getKnowledgeBaseId()).addAll(content, split);
}
/**
@@ -139,19 +144,20 @@ public class RagEtlService {
public DocumentSplitter documentSplitter(ProcessType processType) {
return switch (processType) {
- case CHAPTER_CHUNK -> new DocumentByParagraphSplitter(1000, 100);
- case PARAGRAPH_CHUNK -> new DocumentByLineSplitter(1000, 100);
- case LENGTH_CHUNK -> new DocumentBySentenceSplitter(1000, 100);
- case CUSTOM_SEPARATOR_CHUNK -> new DocumentByWordSplitter(1000, 100);
- case DEFAULT_CHUNK -> new DocumentByRegexSplitter("\\n\\n", "",1000, 100);
+ case PARAGRAPH_CHUNK -> new DocumentByParagraphSplitter(1000, 100);
+ case CHAPTER_CHUNK -> new DocumentByLineSplitter(1000, 100);
+ case CUSTOM_SEPARATOR_CHUNK -> new DocumentBySentenceSplitter(1000, 100);
+ case LENGTH_CHUNK -> new DocumentByWordSplitter(1000, 100);
+ case DEFAULT_CHUNK -> new DocumentByLineSplitter(1000, 100);
};
}
- public EmbeddingStore embeddingStore() {
+ public EmbeddingStore embeddingStore(EmbeddingModel embeddingModel, String knowledgeBaseId) {
return MilvusEmbeddingStore.builder()
- .uri("http://milvus:19530")
- .collectionName("rag_embeddings")
- .dimension(1536)
+ .host(milvusHost)
+ .port(milvusPort)
+ .collectionName("datamate_" + knowledgeBaseId)
+ .dimension(embeddingModel.dimension())
.build();
}
}
diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/EmbeddingController.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/EmbeddingController.java
deleted file mode 100644
index 06963dc..0000000
--- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/EmbeddingController.java
+++ /dev/null
@@ -1,8 +0,0 @@
-package com.datamate.rag.indexer.interfaces;
-
-import org.springframework.web.bind.annotation.RestController;
-
-@RestController
-public class EmbeddingController {
-
-}
diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/KnowledgeBaseController.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/KnowledgeBaseController.java
index d0ed099..b0450e0 100644
--- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/KnowledgeBaseController.java
+++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/KnowledgeBaseController.java
@@ -4,14 +4,14 @@ import com.datamate.rag.indexer.application.KnowledgeBaseService;
import com.datamate.rag.indexer.domain.model.KnowledgeBase;
import com.datamate.rag.indexer.domain.model.RagChunk;
import com.datamate.rag.indexer.domain.model.RagFile;
-import com.datamate.common.infrastructure.common.Response;
import com.datamate.common.interfaces.PagedResponse;
import com.datamate.common.interfaces.PagingQuery;
import com.datamate.rag.indexer.interfaces.dto.*;
+import jakarta.validation.Valid;
import lombok.RequiredArgsConstructor;
import org.springframework.web.bind.annotation.*;
-import javax.validation.Valid;
+
/**
* 知识库控制器
@@ -21,15 +21,10 @@ import javax.validation.Valid;
*/
@RestController
@RequiredArgsConstructor
-@RequestMapping("/v1/knowledge-base")
+@RequestMapping("/knowledge-base")
public class KnowledgeBaseController {
private final KnowledgeBaseService knowledgeBaseService;
- @GetMapping(path = "/test1")
- public String test() {
- return "test1";
- }
-
/**
* 创建知识库
*
@@ -105,7 +100,7 @@ public class KnowledgeBaseController {
*/
@GetMapping("/{knowledgeBaseId}/files")
public PagedResponse listFiles(@PathVariable("knowledgeBaseId") String knowledgeBaseId,
- @RequestBody @Valid RagFileReq request) {
+ RagFileReq request) {
return knowledgeBaseService.listFiles(knowledgeBaseId, request);
}
diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/dto/AddFilesReq.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/dto/AddFilesReq.java
index 52568a2..5edd578 100644
--- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/dto/AddFilesReq.java
+++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/dto/AddFilesReq.java
@@ -18,6 +18,6 @@ public class AddFilesReq {
private ProcessType processType;
private List files;
- public record FileInfo(String fileId, String fileName) {
+ public record FileInfo(String id, String name) {
}
}