diff --git a/backend/pom.xml b/backend/pom.xml index ed1252c..97739ea 100644 --- a/backend/pom.xml +++ b/backend/pom.xml @@ -75,6 +75,13 @@ pom import + + com.google.protobuf + protobuf-bom + 3.25.5 + pom + import + org.springframework.cloud spring-cloud-dependencies @@ -165,12 +172,6 @@ - - org.springframework.ai - spring-ai-starter-mcp-server-webmvc - ${spring-ai.version} - - com.baomidou mybatis-plus-spring-boot3-starter diff --git a/backend/services/main-application/src/main/resources/application.yml b/backend/services/main-application/src/main/resources/application.yml index 51206ba..6c66d63 100644 --- a/backend/services/main-application/src/main/resources/application.yml +++ b/backend/services/main-application/src/main/resources/application.yml @@ -167,13 +167,5 @@ datamate: # RAG配置 rag: - embedding: - model: ${RAG_EMBEDDING_MODEL:text-embedding-ada-002} - api-key: ${RAG_API_KEY:} - dimension: ${RAG_DIMENSION:1536} - chunk: - size: ${RAG_CHUNK_SIZE:512} - overlap: ${RAG_CHUNK_OVERLAP:50} - retrieval: - top-k: ${RAG_TOP_K:5} - score-threshold: ${RAG_SCORE_THRESHOLD:0.7} + milvus-host: ${MILVUS_HOST:milvus-standalone} + milvus-port: ${MILVUS_PORT:19530} diff --git a/backend/services/rag-indexer-service/pom.xml b/backend/services/rag-indexer-service/pom.xml index 454a3d9..cef1760 100644 --- a/backend/services/rag-indexer-service/pom.xml +++ b/backend/services/rag-indexer-service/pom.xml @@ -98,15 +98,12 @@ dev.langchain4j langchain4j-milvus - - - - dev.langchain4j - langchain4j-embeddings-all-minilm-l6-v2 - - - org.testcontainers - milvus + + + com.google.protobuf + protobuf-java + + diff --git a/backend/services/rag-indexer-service/src/main/java/com/dataengine/rag/indexer/RagApplication.java b/backend/services/rag-indexer-service/src/main/java/com/dataengine/rag/indexer/RagApplication.java deleted file mode 100644 index 4f5af38..0000000 --- a/backend/services/rag-indexer-service/src/main/java/com/dataengine/rag/indexer/RagApplication.java +++ /dev/null @@ -1,17 +0,0 @@ -package com.dataengine.rag.indexer; - -import org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; - -/** - * - * - * @author dallas - * @since 2025-10-13 - */ -@SpringBootApplication -public class RagApplication { - public static void main(String[] args) { - SpringApplication.run(RagApplication.class, args); - } -} diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/application/KnowledgeBaseService.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/application/KnowledgeBaseService.java index 989816d..65efb91 100644 --- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/application/KnowledgeBaseService.java +++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/application/KnowledgeBaseService.java @@ -93,13 +93,13 @@ public class KnowledgeBaseService { List ragFiles = request.getFiles().stream().map(fileInfo -> { RagFile ragFile = new RagFile(); ragFile.setKnowledgeBaseId(knowledgeBase.getId()); - ragFile.setFileId(fileInfo.fileId()); - ragFile.setFileName(fileInfo.fileName()); + ragFile.setFileId(fileInfo.id()); + ragFile.setFileName(fileInfo.name()); ragFile.setStatus(FileStatus.UNPROCESSED); return ragFile; }).toList(); ragFileRepository.saveBatch(ragFiles, 100); - eventPublisher.publishEvent(new DataInsertedEvent(knowledgeBase.getId(), request.getProcessType())); + eventPublisher.publishEvent(new DataInsertedEvent(knowledgeBase, request.getProcessType())); } public PagedResponse listFiles(String knowledgeBaseId, RagFileReq request) { diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/DataInsertedEvent.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/DataInsertedEvent.java index 417de1e..af749b2 100644 --- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/DataInsertedEvent.java +++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/DataInsertedEvent.java @@ -1,5 +1,6 @@ package com.datamate.rag.indexer.infrastructure.event; +import com.datamate.rag.indexer.domain.model.KnowledgeBase; import com.datamate.rag.indexer.interfaces.dto.ProcessType; /** @@ -8,5 +9,5 @@ import com.datamate.rag.indexer.interfaces.dto.ProcessType; * @author dallas * @since 2025-10-29 */ -public record DataInsertedEvent(String knowledgeBaseId, ProcessType processType) { +public record DataInsertedEvent(KnowledgeBase knowledgeBase, ProcessType processType) { } diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java index 5c9979e..75d4e5e 100644 --- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java +++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/infrastructure/event/RagEtlService.java @@ -23,12 +23,11 @@ import dev.langchain4j.data.document.transformer.jsoup.HtmlToTextDocumentTransfo import dev.langchain4j.data.embedding.Embedding; import dev.langchain4j.data.segment.TextSegment; import dev.langchain4j.model.embedding.EmbeddingModel; -import dev.langchain4j.model.output.Response; import dev.langchain4j.store.embedding.EmbeddingStore; import dev.langchain4j.store.embedding.milvus.MilvusEmbeddingStore; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.jetbrains.annotations.NotNull; +import org.springframework.beans.factory.annotation.Value; import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Service; import org.springframework.transaction.event.TransactionPhase; @@ -52,6 +51,11 @@ import java.util.concurrent.Semaphore; public class RagEtlService { private static final Semaphore SEMAPHORE = new Semaphore(10); + @Value("${datamate.rag.milvus-host}") + private String milvusHost; + @Value("${datamate.rag.milvus-port}") + private int milvusPort; + private final RagFileRepository ragFileRepository; private final DatasetFileRepository datasetFileRepository; @@ -64,7 +68,7 @@ public class RagEtlService { @TransactionalEventListener(phase = TransactionPhase.AFTER_COMMIT) public void processAfterCommit(DataInsertedEvent event) { // 执行 RAG 处理流水线 - List ragFiles = ragFileRepository.findByKnowledgeBaseId(event.knowledgeBaseId()); + List ragFiles = ragFileRepository.findByKnowledgeBaseId(event.knowledgeBase().getId()); ragFiles.forEach(ragFile -> { try { @@ -74,12 +78,13 @@ public class RagEtlService { // 执行 RAG 处理流水线 ragFile.setStatus(FileStatus.PROCESSING); ragFileRepository.updateById(ragFile); - processRagFile(ragFile, event.processType()); + processRagFile(ragFile, event); // 更新文件状态为已处理 ragFile.setStatus(FileStatus.PROCESSED); ragFileRepository.updateById(ragFile); } catch (Exception e) { // 处理异常 + log.error("Error processing RAG file: {}", ragFile.getFileId(), e); ragFile.setStatus(FileStatus.PROCESS_FAILED); ragFileRepository.updateById(ragFile); } finally { @@ -93,7 +98,7 @@ public class RagEtlService { ); } - private void processRagFile(RagFile ragFile, ProcessType processType) { + private void processRagFile(RagFile ragFile, DataInsertedEvent event) { DatasetFile file = datasetFileRepository.getById(ragFile.getFileId()); // 使用文档解析器解析文档 DocumentParser parser = documentParser(file.getFileType()); @@ -101,10 +106,10 @@ public class RagEtlService { Document document = FileSystemDocumentLoader.loadDocument(file.getFilePath(), parser); // 对html文档进行转换 if (Arrays.asList("html", "htm").contains(file.getFileType().toLowerCase())) { - document= new HtmlToTextDocumentTransformer().transform(document); + document = new HtmlToTextDocumentTransformer().transform(document); } // 使用文档分块器对文档进行分块 - DocumentSplitter splitter = documentSplitter(processType); + DocumentSplitter splitter = documentSplitter(event.processType()); List split = splitter.split(document); // 更新分块数量 @@ -112,12 +117,12 @@ public class RagEtlService { ragFileRepository.updateById(ragFile); // 调用模型客户端获取嵌入模型 - ModelConfig model = modelConfigRepository.getById("1"); + ModelConfig model = modelConfigRepository.getById(event.knowledgeBase().getEmbeddingModel()); EmbeddingModel embeddingModel = ModelClient.invokeEmbeddingModel(model); // 调用嵌入模型获取嵌入向量 - Response<@NotNull List> response = embeddingModel.embedAll(split); + List content = embeddingModel.embedAll(split).content(); // 存储嵌入向量到 Milvus - embeddingStore().addAll(response.content(), split); + embeddingStore(embeddingModel, ragFile.getKnowledgeBaseId()).addAll(content, split); } /** @@ -139,19 +144,20 @@ public class RagEtlService { public DocumentSplitter documentSplitter(ProcessType processType) { return switch (processType) { - case CHAPTER_CHUNK -> new DocumentByParagraphSplitter(1000, 100); - case PARAGRAPH_CHUNK -> new DocumentByLineSplitter(1000, 100); - case LENGTH_CHUNK -> new DocumentBySentenceSplitter(1000, 100); - case CUSTOM_SEPARATOR_CHUNK -> new DocumentByWordSplitter(1000, 100); - case DEFAULT_CHUNK -> new DocumentByRegexSplitter("\\n\\n", "",1000, 100); + case PARAGRAPH_CHUNK -> new DocumentByParagraphSplitter(1000, 100); + case CHAPTER_CHUNK -> new DocumentByLineSplitter(1000, 100); + case CUSTOM_SEPARATOR_CHUNK -> new DocumentBySentenceSplitter(1000, 100); + case LENGTH_CHUNK -> new DocumentByWordSplitter(1000, 100); + case DEFAULT_CHUNK -> new DocumentByLineSplitter(1000, 100); }; } - public EmbeddingStore embeddingStore() { + public EmbeddingStore embeddingStore(EmbeddingModel embeddingModel, String knowledgeBaseId) { return MilvusEmbeddingStore.builder() - .uri("http://milvus:19530") - .collectionName("rag_embeddings") - .dimension(1536) + .host(milvusHost) + .port(milvusPort) + .collectionName("datamate_" + knowledgeBaseId) + .dimension(embeddingModel.dimension()) .build(); } } diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/EmbeddingController.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/EmbeddingController.java deleted file mode 100644 index 06963dc..0000000 --- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/EmbeddingController.java +++ /dev/null @@ -1,8 +0,0 @@ -package com.datamate.rag.indexer.interfaces; - -import org.springframework.web.bind.annotation.RestController; - -@RestController -public class EmbeddingController { - -} diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/KnowledgeBaseController.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/KnowledgeBaseController.java index d0ed099..b0450e0 100644 --- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/KnowledgeBaseController.java +++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/KnowledgeBaseController.java @@ -4,14 +4,14 @@ import com.datamate.rag.indexer.application.KnowledgeBaseService; import com.datamate.rag.indexer.domain.model.KnowledgeBase; import com.datamate.rag.indexer.domain.model.RagChunk; import com.datamate.rag.indexer.domain.model.RagFile; -import com.datamate.common.infrastructure.common.Response; import com.datamate.common.interfaces.PagedResponse; import com.datamate.common.interfaces.PagingQuery; import com.datamate.rag.indexer.interfaces.dto.*; +import jakarta.validation.Valid; import lombok.RequiredArgsConstructor; import org.springframework.web.bind.annotation.*; -import javax.validation.Valid; + /** * 知识库控制器 @@ -21,15 +21,10 @@ import javax.validation.Valid; */ @RestController @RequiredArgsConstructor -@RequestMapping("/v1/knowledge-base") +@RequestMapping("/knowledge-base") public class KnowledgeBaseController { private final KnowledgeBaseService knowledgeBaseService; - @GetMapping(path = "/test1") - public String test() { - return "test1"; - } - /** * 创建知识库 * @@ -105,7 +100,7 @@ public class KnowledgeBaseController { */ @GetMapping("/{knowledgeBaseId}/files") public PagedResponse listFiles(@PathVariable("knowledgeBaseId") String knowledgeBaseId, - @RequestBody @Valid RagFileReq request) { + RagFileReq request) { return knowledgeBaseService.listFiles(knowledgeBaseId, request); } diff --git a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/dto/AddFilesReq.java b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/dto/AddFilesReq.java index 52568a2..5edd578 100644 --- a/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/dto/AddFilesReq.java +++ b/backend/services/rag-indexer-service/src/main/java/com/datamate/rag/indexer/interfaces/dto/AddFilesReq.java @@ -18,6 +18,6 @@ public class AddFilesReq { private ProcessType processType; private List files; - public record FileInfo(String fileId, String fileName) { + public record FileInfo(String id, String name) { } }