From 5a553ddde3af46bdd7dea63d50548de45255a4f2 Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Tue, 17 Feb 2026 20:42:55 +0800 Subject: [PATCH] =?UTF-8?q?feat(knowledge-graph):=20=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=E7=9F=A5=E8=AF=86=E5=9B=BE=E8=B0=B1=E5=9F=BA=E7=A1=80=E8=AE=BE?= =?UTF-8?q?=E6=96=BD=E6=90=AD=E5=BB=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 实现功能: - Neo4j Docker Compose 配置(社区版,端口 7474/7687,数据持久化) - Makefile 新增 Neo4j 命令(neo4j-up/down/logs/shell) - knowledge-graph-service Spring Boot 服务(完整的 DDD 分层架构) - kg_extraction Python 模块(基于 LangChain LLMGraphTransformer) 技术实现: - Neo4j 配置:环境变量化密码,统一默认值 datamate123 - Java 服务: - Domain: GraphEntity, GraphRelation 实体模型 - Repository: Spring Data Neo4j,支持 graphId 范围查询 - Service: 业务逻辑,graphId 双重校验,查询限流 - Controller: REST API,UUID 格式校验 - Exception: 实现 ErrorCode 接口,统一异常体系 - Python 模块: - KnowledgeGraphExtractor 类 - 支持异步/同步/批量抽取 - 支持 schema-guided 模式 - 兼容 OpenAI 及自部署模型 关键设计: - graphId 权限边界:所有实体操作都在正确的 graphId 范围内 - 查询限流:depth 和 limit 参数受配置约束 - 异常处理:统一使用 BusinessException + ErrorCode - 凭据管理:环境变量化,避免硬编码 - 双重防御:Controller 格式校验 + Service 业务校验 代码审查: - 经过 3 轮 Codex 审查和 2 轮 Claude 修复 - 所有 P0 和 P1 问题已解决 - 编译通过,无阻塞性问题 文件变更: - 新增:Neo4j 配置、knowledge-graph-service(11 个 Java 文件)、kg_extraction(3 个 Python 文件) - 修改:Makefile、pom.xml、application.yml、pyproject.toml --- Makefile | 28 +++ .../services/knowledge-graph-service/pom.xml | 109 +++++++++++ .../KnowledgeGraphServiceConfiguration.java | 11 ++ .../application/GraphEntityService.java | 120 ++++++++++++ .../domain/model/GraphEntity.java | 81 ++++++++ .../domain/model/GraphRelation.java | 61 ++++++ .../repository/GraphEntityRepository.java | 44 +++++ .../exception/KnowledgeGraphErrorCode.java | 25 +++ .../neo4j/KnowledgeGraphProperties.java | 20 ++ .../interfaces/dto/CreateEntityRequest.java | 31 +++ .../interfaces/dto/CreateRelationRequest.java | 28 +++ .../interfaces/dto/UpdateEntityRequest.java | 18 ++ .../rest/GraphEntityController.java | 80 ++++++++ .../resources/application-knowledgegraph.yml | 25 +++ backend/services/main-application/pom.xml | 7 + .../src/main/resources/application.yml | 1 + backend/services/pom.xml | 3 + deployment/docker/neo4j/docker-compose.yml | 39 ++++ .../app/module/kg_extraction/__init__.py | 17 ++ .../app/module/kg_extraction/extractor.py | 183 ++++++++++++++++++ .../app/module/kg_extraction/models.py | 75 +++++++ runtime/datamate-python/pyproject.toml | 1 + 22 files changed, 1007 insertions(+) create mode 100644 backend/services/knowledge-graph-service/pom.xml create mode 100644 backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/KnowledgeGraphServiceConfiguration.java create mode 100644 backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/application/GraphEntityService.java create mode 100644 backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/domain/model/GraphEntity.java create mode 100644 backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/domain/model/GraphRelation.java create mode 100644 backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/domain/repository/GraphEntityRepository.java create mode 100644 backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/infrastructure/exception/KnowledgeGraphErrorCode.java create mode 100644 backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/infrastructure/neo4j/KnowledgeGraphProperties.java create mode 100644 backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/dto/CreateEntityRequest.java create mode 100644 backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/dto/CreateRelationRequest.java create mode 100644 backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/dto/UpdateEntityRequest.java create mode 100644 backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/rest/GraphEntityController.java create mode 100644 backend/services/knowledge-graph-service/src/main/resources/application-knowledgegraph.yml create mode 100644 deployment/docker/neo4j/docker-compose.yml create mode 100644 runtime/datamate-python/app/module/kg_extraction/__init__.py create mode 100644 runtime/datamate-python/app/module/kg_extraction/extractor.py create mode 100644 runtime/datamate-python/app/module/kg_extraction/models.py diff --git a/Makefile b/Makefile index a761c1b..b0962fa 100644 --- a/Makefile +++ b/Makefile @@ -76,6 +76,12 @@ help: @echo " make download SAVE=true PLATFORM=linux/arm64 Save ARM64 images" @echo " make load-images Load all downloaded images from dist/" @echo "" + @echo "Neo4j Commands:" + @echo " make neo4j-up Start Neo4j graph database" + @echo " make neo4j-down Stop Neo4j graph database" + @echo " make neo4j-logs View Neo4j logs" + @echo " make neo4j-shell Open Neo4j Cypher shell" + @echo "" @echo "Utility Commands:" @echo " make create-namespace Create Kubernetes namespace" @echo " make help Show this help message" @@ -498,3 +504,25 @@ load-images: else \ echo "Successfully loaded $$count image(s)"; \ fi + +# ========== Neo4j Targets ========== + +.PHONY: neo4j-up +neo4j-up: + @echo "Starting Neo4j graph database..." + docker compose -f deployment/docker/neo4j/docker-compose.yml up -d + @echo "Neo4j Browser: http://localhost:7474" + @echo "Bolt URI: bolt://localhost:7687" + +.PHONY: neo4j-down +neo4j-down: + @echo "Stopping Neo4j graph database..." + docker compose -f deployment/docker/neo4j/docker-compose.yml down + +.PHONY: neo4j-logs +neo4j-logs: + docker compose -f deployment/docker/neo4j/docker-compose.yml logs -f + +.PHONY: neo4j-shell +neo4j-shell: + docker exec -it datamate-neo4j cypher-shell -u neo4j -p "$${NEO4J_PASSWORD:-datamate123}" diff --git a/backend/services/knowledge-graph-service/pom.xml b/backend/services/knowledge-graph-service/pom.xml new file mode 100644 index 0000000..276d770 --- /dev/null +++ b/backend/services/knowledge-graph-service/pom.xml @@ -0,0 +1,109 @@ + + + 4.0.0 + + + com.datamate + services + 1.0.0-SNAPSHOT + ../pom.xml + + + knowledge-graph-service + Knowledge Graph Service + 知识图谱服务 - 基于Neo4j的实体关系管理与图谱查询 + + + + com.datamate + domain-common + ${project.version} + + + + + org.springframework.boot + spring-boot-starter-data-neo4j + + + + org.springframework.boot + spring-boot-starter-web + + + org.springframework.boot + spring-boot-starter-data-redis + + + + com.mysql + mysql-connector-j + ${mysql.version} + + + + org.springframework.boot + spring-boot-starter-test + test + + + + org.springdoc + springdoc-openapi-starter-webmvc-ui + + + org.openapitools + jackson-databind-nullable + + + jakarta.validation + jakarta.validation-api + + + + + + + org.springframework.boot + spring-boot-maven-plugin + + true + exec + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.11.0 + + ${maven.compiler.source} + ${maven.compiler.target} + + + org.projectlombok + lombok + ${lombok.version} + + + org.projectlombok + lombok-mapstruct-binding + ${lombok-mapstruct-binding.version} + + + org.mapstruct + mapstruct-processor + ${mapstruct.version} + + + + -parameters + -Amapstruct.defaultComponentModel=spring + + + + + + diff --git a/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/KnowledgeGraphServiceConfiguration.java b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/KnowledgeGraphServiceConfiguration.java new file mode 100644 index 0000000..107fb26 --- /dev/null +++ b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/KnowledgeGraphServiceConfiguration.java @@ -0,0 +1,11 @@ +package com.datamate.knowledgegraph; + +import org.springframework.context.annotation.ComponentScan; +import org.springframework.context.annotation.Configuration; +import org.springframework.data.neo4j.repository.config.EnableNeo4jRepositories; + +@Configuration +@ComponentScan(basePackages = "com.datamate.knowledgegraph") +@EnableNeo4jRepositories(basePackages = "com.datamate.knowledgegraph.domain.repository") +public class KnowledgeGraphServiceConfiguration { +} diff --git a/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/application/GraphEntityService.java b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/application/GraphEntityService.java new file mode 100644 index 0000000..9e654dc --- /dev/null +++ b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/application/GraphEntityService.java @@ -0,0 +1,120 @@ +package com.datamate.knowledgegraph.application; + +import com.datamate.common.infrastructure.exception.BusinessException; +import com.datamate.common.infrastructure.exception.SystemErrorCode; +import com.datamate.knowledgegraph.domain.model.GraphEntity; +import com.datamate.knowledgegraph.domain.repository.GraphEntityRepository; +import com.datamate.knowledgegraph.infrastructure.exception.KnowledgeGraphErrorCode; +import com.datamate.knowledgegraph.infrastructure.neo4j.KnowledgeGraphProperties; +import com.datamate.knowledgegraph.interfaces.dto.CreateEntityRequest; +import com.datamate.knowledgegraph.interfaces.dto.UpdateEntityRequest; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; + +import java.time.LocalDateTime; +import java.util.List; +import java.util.regex.Pattern; + +@Service +@Slf4j +@RequiredArgsConstructor +public class GraphEntityService { + + private static final Pattern UUID_PATTERN = Pattern.compile( + "^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$" + ); + + private final GraphEntityRepository entityRepository; + private final KnowledgeGraphProperties properties; + + @Transactional + public GraphEntity createEntity(String graphId, CreateEntityRequest request) { + validateGraphId(graphId); + GraphEntity entity = GraphEntity.builder() + .name(request.getName()) + .type(request.getType()) + .description(request.getDescription()) + .aliases(request.getAliases()) + .properties(request.getProperties()) + .sourceId(request.getSourceId()) + .sourceType(request.getSourceType()) + .graphId(graphId) + .confidence(request.getConfidence() != null ? request.getConfidence() : 1.0) + .createdAt(LocalDateTime.now()) + .updatedAt(LocalDateTime.now()) + .build(); + return entityRepository.save(entity); + } + + public GraphEntity getEntity(String graphId, String entityId) { + validateGraphId(graphId); + return entityRepository.findByIdAndGraphId(entityId, graphId) + .orElseThrow(() -> BusinessException.of(KnowledgeGraphErrorCode.ENTITY_NOT_FOUND)); + } + + public List listEntities(String graphId) { + validateGraphId(graphId); + return entityRepository.findByGraphId(graphId); + } + + public List searchEntities(String graphId, String name) { + validateGraphId(graphId); + return entityRepository.findByGraphIdAndNameContaining(graphId, name); + } + + public List listEntitiesByType(String graphId, String type) { + validateGraphId(graphId); + return entityRepository.findByGraphIdAndType(graphId, type); + } + + @Transactional + public GraphEntity updateEntity(String graphId, String entityId, UpdateEntityRequest request) { + validateGraphId(graphId); + GraphEntity entity = getEntity(graphId, entityId); + if (request.getName() != null) { + entity.setName(request.getName()); + } + if (request.getDescription() != null) { + entity.setDescription(request.getDescription()); + } + if (request.getAliases() != null) { + entity.setAliases(request.getAliases()); + } + if (request.getProperties() != null) { + entity.setProperties(request.getProperties()); + } + entity.setUpdatedAt(LocalDateTime.now()); + return entityRepository.save(entity); + } + + @Transactional + public void deleteEntity(String graphId, String entityId) { + validateGraphId(graphId); + GraphEntity entity = getEntity(graphId, entityId); + entityRepository.delete(entity); + } + + public List getNeighbors(String graphId, String entityId, int depth, int limit) { + validateGraphId(graphId); + int clampedDepth = Math.max(1, Math.min(depth, properties.getMaxDepth())); + int clampedLimit = Math.max(1, Math.min(limit, properties.getMaxNodesPerQuery())); + return entityRepository.findNeighbors(graphId, entityId, clampedDepth, clampedLimit); + } + + public long countEntities(String graphId) { + validateGraphId(graphId); + return entityRepository.countByGraphId(graphId); + } + + /** + * 校验 graphId 格式(UUID)。 + * 防止恶意构造的 graphId 注入 Cypher 查询。 + */ + private void validateGraphId(String graphId) { + if (graphId == null || !UUID_PATTERN.matcher(graphId).matches()) { + throw BusinessException.of(SystemErrorCode.INVALID_PARAMETER, "graphId 格式无效"); + } + } +} diff --git a/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/domain/model/GraphEntity.java b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/domain/model/GraphEntity.java new file mode 100644 index 0000000..3f41212 --- /dev/null +++ b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/domain/model/GraphEntity.java @@ -0,0 +1,81 @@ +package com.datamate.knowledgegraph.domain.model; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; +import org.springframework.data.neo4j.core.schema.DynamicLabels; +import org.springframework.data.neo4j.core.schema.GeneratedValue; +import org.springframework.data.neo4j.core.schema.Id; +import org.springframework.data.neo4j.core.schema.Node; +import org.springframework.data.neo4j.core.schema.Property; +import org.springframework.data.neo4j.core.support.UUIDStringGenerator; + +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * 知识图谱实体节点。 + *

+ * 在 Neo4j 中,每个实体作为一个节点存储, + * 通过 {@code type} 属性区分具体类型(Person, Organization, Concept 等), + * 并支持通过 {@code properties} 存储灵活的扩展属性。 + */ +@Node("Entity") +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class GraphEntity { + + @Id + @GeneratedValue(UUIDStringGenerator.class) + private String id; + + @Property("name") + private String name; + + @Property("type") + private String type; + + @Property("description") + private String description; + + @DynamicLabels + @Builder.Default + private List labels = new ArrayList<>(); + + @Property("aliases") + @Builder.Default + private List aliases = new ArrayList<>(); + + @Property("properties") + @Builder.Default + private Map properties = new HashMap<>(); + + /** 来源数据集/知识库的 ID */ + @Property("source_id") + private String sourceId; + + /** 来源类型:ANNOTATION, KNOWLEDGE_BASE, IMPORT, MANUAL */ + @Property("source_type") + private String sourceType; + + /** 所属图谱 ID(对应 MySQL 中的 t_dm_knowledge_graphs.id) */ + @Property("graph_id") + private String graphId; + + /** 自动抽取的置信度 */ + @Property("confidence") + @Builder.Default + private Double confidence = 1.0; + + @Property("created_at") + private LocalDateTime createdAt; + + @Property("updated_at") + private LocalDateTime updatedAt; +} diff --git a/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/domain/model/GraphRelation.java b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/domain/model/GraphRelation.java new file mode 100644 index 0000000..4138946 --- /dev/null +++ b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/domain/model/GraphRelation.java @@ -0,0 +1,61 @@ +package com.datamate.knowledgegraph.domain.model; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; +import org.springframework.data.neo4j.core.schema.GeneratedValue; +import org.springframework.data.neo4j.core.schema.Id; +import org.springframework.data.neo4j.core.schema.Property; +import org.springframework.data.neo4j.core.schema.RelationshipProperties; +import org.springframework.data.neo4j.core.schema.TargetNode; +import org.springframework.data.neo4j.core.support.UUIDStringGenerator; + +import java.time.LocalDateTime; +import java.util.HashMap; +import java.util.Map; + +/** + * 知识图谱关系(边)。 + *

+ * 使用 Spring Data Neo4j 的 {@code @RelationshipProperties} 表示带属性的关系。 + * 关系的具体类型通过 {@code relationType} 表达(如 belongs_to, located_in)。 + */ +@RelationshipProperties +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class GraphRelation { + + @Id + @GeneratedValue(UUIDStringGenerator.class) + private String id; + + @TargetNode + private GraphEntity target; + + @Property("relation_type") + private String relationType; + + @Property("properties") + @Builder.Default + private Map properties = new HashMap<>(); + + @Property("weight") + @Builder.Default + private Double weight = 1.0; + + @Property("source_id") + private String sourceId; + + @Property("confidence") + @Builder.Default + private Double confidence = 1.0; + + @Property("graph_id") + private String graphId; + + @Property("created_at") + private LocalDateTime createdAt; +} diff --git a/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/domain/repository/GraphEntityRepository.java b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/domain/repository/GraphEntityRepository.java new file mode 100644 index 0000000..ae6a677 --- /dev/null +++ b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/domain/repository/GraphEntityRepository.java @@ -0,0 +1,44 @@ +package com.datamate.knowledgegraph.domain.repository; + +import com.datamate.knowledgegraph.domain.model.GraphEntity; +import org.springframework.data.neo4j.repository.Neo4jRepository; +import org.springframework.data.neo4j.repository.query.Query; +import org.springframework.data.repository.query.Param; +import org.springframework.stereotype.Repository; + +import java.util.List; +import java.util.Optional; + +@Repository +public interface GraphEntityRepository extends Neo4jRepository { + + @Query("MATCH (e:Entity {graph_id: $graphId}) WHERE e.id = $entityId RETURN e") + Optional findByIdAndGraphId( + @Param("entityId") String entityId, + @Param("graphId") String graphId); + + List findByGraphId(String graphId); + + List findByGraphIdAndType(String graphId, String type); + + List findByGraphIdAndNameContaining(String graphId, String name); + + @Query("MATCH (e:Entity {graph_id: $graphId}) " + + "WHERE e.name = $name AND e.type = $type " + + "RETURN e") + List findByGraphIdAndNameAndType( + @Param("graphId") String graphId, + @Param("name") String name, + @Param("type") String type); + + @Query("MATCH (e:Entity {graph_id: $graphId, id: $entityId})-[r*1..$depth]-(neighbor:Entity) " + + "RETURN DISTINCT neighbor LIMIT $limit") + List findNeighbors( + @Param("graphId") String graphId, + @Param("entityId") String entityId, + @Param("depth") int depth, + @Param("limit") int limit); + + @Query("MATCH (e:Entity {graph_id: $graphId}) RETURN count(e)") + long countByGraphId(@Param("graphId") String graphId); +} diff --git a/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/infrastructure/exception/KnowledgeGraphErrorCode.java b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/infrastructure/exception/KnowledgeGraphErrorCode.java new file mode 100644 index 0000000..5983536 --- /dev/null +++ b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/infrastructure/exception/KnowledgeGraphErrorCode.java @@ -0,0 +1,25 @@ +package com.datamate.knowledgegraph.infrastructure.exception; + +import com.datamate.common.infrastructure.exception.ErrorCode; +import lombok.AllArgsConstructor; +import lombok.Getter; + +/** + * 知识图谱模块错误码 + */ +@Getter +@AllArgsConstructor +public enum KnowledgeGraphErrorCode implements ErrorCode { + + ENTITY_NOT_FOUND("knowledge_graph.0001", "实体不存在"), + RELATION_NOT_FOUND("knowledge_graph.0002", "关系不存在"), + GRAPH_NOT_FOUND("knowledge_graph.0003", "图谱不存在"), + DUPLICATE_ENTITY("knowledge_graph.0004", "实体已存在"), + INVALID_RELATION("knowledge_graph.0005", "无效的关系定义"), + IMPORT_FAILED("knowledge_graph.0006", "图谱导入失败"), + QUERY_DEPTH_EXCEEDED("knowledge_graph.0007", "查询深度超出限制"), + MAX_NODES_EXCEEDED("knowledge_graph.0008", "查询结果节点数超出限制"); + + private final String code; + private final String message; +} diff --git a/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/infrastructure/neo4j/KnowledgeGraphProperties.java b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/infrastructure/neo4j/KnowledgeGraphProperties.java new file mode 100644 index 0000000..65612fb --- /dev/null +++ b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/infrastructure/neo4j/KnowledgeGraphProperties.java @@ -0,0 +1,20 @@ +package com.datamate.knowledgegraph.infrastructure.neo4j; + +import lombok.Data; +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.springframework.stereotype.Component; + +@Data +@Component +@ConfigurationProperties(prefix = "datamate.knowledge-graph") +public class KnowledgeGraphProperties { + + /** 默认查询跳数限制 */ + private int maxDepth = 3; + + /** 子图返回最大节点数 */ + private int maxNodesPerQuery = 500; + + /** 批量导入批次大小 */ + private int importBatchSize = 100; +} diff --git a/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/dto/CreateEntityRequest.java b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/dto/CreateEntityRequest.java new file mode 100644 index 0000000..2e873c9 --- /dev/null +++ b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/dto/CreateEntityRequest.java @@ -0,0 +1,31 @@ +package com.datamate.knowledgegraph.interfaces.dto; + +import jakarta.validation.constraints.NotBlank; +import lombok.Data; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +@Data +public class CreateEntityRequest { + + @NotBlank(message = "实体名称不能为空") + private String name; + + @NotBlank(message = "实体类型不能为空") + private String type; + + private String description; + + private List aliases = new ArrayList<>(); + + private Map properties = new HashMap<>(); + + private String sourceId; + + private String sourceType; + + private Double confidence; +} diff --git a/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/dto/CreateRelationRequest.java b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/dto/CreateRelationRequest.java new file mode 100644 index 0000000..7a6c311 --- /dev/null +++ b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/dto/CreateRelationRequest.java @@ -0,0 +1,28 @@ +package com.datamate.knowledgegraph.interfaces.dto; + +import jakarta.validation.constraints.NotBlank; +import lombok.Data; + +import java.util.HashMap; +import java.util.Map; + +@Data +public class CreateRelationRequest { + + @NotBlank(message = "源实体ID不能为空") + private String sourceEntityId; + + @NotBlank(message = "目标实体ID不能为空") + private String targetEntityId; + + @NotBlank(message = "关系类型不能为空") + private String relationType; + + private Map properties = new HashMap<>(); + + private Double weight; + + private String sourceId; + + private Double confidence; +} diff --git a/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/dto/UpdateEntityRequest.java b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/dto/UpdateEntityRequest.java new file mode 100644 index 0000000..936caf9 --- /dev/null +++ b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/dto/UpdateEntityRequest.java @@ -0,0 +1,18 @@ +package com.datamate.knowledgegraph.interfaces.dto; + +import lombok.Data; + +import java.util.List; +import java.util.Map; + +@Data +public class UpdateEntityRequest { + + private String name; + + private String description; + + private List aliases; + + private Map properties; +} diff --git a/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/rest/GraphEntityController.java b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/rest/GraphEntityController.java new file mode 100644 index 0000000..30a7f4c --- /dev/null +++ b/backend/services/knowledge-graph-service/src/main/java/com/datamate/knowledgegraph/interfaces/rest/GraphEntityController.java @@ -0,0 +1,80 @@ +package com.datamate.knowledgegraph.interfaces.rest; + +import com.datamate.knowledgegraph.application.GraphEntityService; +import com.datamate.knowledgegraph.domain.model.GraphEntity; +import com.datamate.knowledgegraph.interfaces.dto.CreateEntityRequest; +import com.datamate.knowledgegraph.interfaces.dto.UpdateEntityRequest; +import jakarta.validation.Valid; +import jakarta.validation.constraints.Pattern; +import lombok.RequiredArgsConstructor; +import org.springframework.http.HttpStatus; +import org.springframework.validation.annotation.Validated; +import org.springframework.web.bind.annotation.*; + +import java.util.List; + +@RestController +@RequestMapping("/knowledge-graph/{graphId}/entities") +@RequiredArgsConstructor +@Validated +public class GraphEntityController { + + private static final String UUID_REGEX = + "^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"; + + private final GraphEntityService entityService; + + @PostMapping + @ResponseStatus(HttpStatus.CREATED) + public GraphEntity createEntity( + @PathVariable @Pattern(regexp = UUID_REGEX, message = "graphId 格式无效") String graphId, + @Valid @RequestBody CreateEntityRequest request) { + return entityService.createEntity(graphId, request); + } + + @GetMapping("/{entityId}") + public GraphEntity getEntity( + @PathVariable @Pattern(regexp = UUID_REGEX, message = "graphId 格式无效") String graphId, + @PathVariable @Pattern(regexp = UUID_REGEX, message = "entityId 格式无效") String entityId) { + return entityService.getEntity(graphId, entityId); + } + + @GetMapping + public List listEntities( + @PathVariable @Pattern(regexp = UUID_REGEX, message = "graphId 格式无效") String graphId, + @RequestParam(required = false) String type, + @RequestParam(required = false) String keyword) { + if (keyword != null && !keyword.isBlank()) { + return entityService.searchEntities(graphId, keyword); + } + if (type != null && !type.isBlank()) { + return entityService.listEntitiesByType(graphId, type); + } + return entityService.listEntities(graphId); + } + + @PutMapping("/{entityId}") + public GraphEntity updateEntity( + @PathVariable @Pattern(regexp = UUID_REGEX, message = "graphId 格式无效") String graphId, + @PathVariable @Pattern(regexp = UUID_REGEX, message = "entityId 格式无效") String entityId, + @Valid @RequestBody UpdateEntityRequest request) { + return entityService.updateEntity(graphId, entityId, request); + } + + @DeleteMapping("/{entityId}") + @ResponseStatus(HttpStatus.NO_CONTENT) + public void deleteEntity( + @PathVariable @Pattern(regexp = UUID_REGEX, message = "graphId 格式无效") String graphId, + @PathVariable @Pattern(regexp = UUID_REGEX, message = "entityId 格式无效") String entityId) { + entityService.deleteEntity(graphId, entityId); + } + + @GetMapping("/{entityId}/neighbors") + public List getNeighbors( + @PathVariable @Pattern(regexp = UUID_REGEX, message = "graphId 格式无效") String graphId, + @PathVariable @Pattern(regexp = UUID_REGEX, message = "entityId 格式无效") String entityId, + @RequestParam(defaultValue = "2") int depth, + @RequestParam(defaultValue = "50") int limit) { + return entityService.getNeighbors(graphId, entityId, depth, limit); + } +} diff --git a/backend/services/knowledge-graph-service/src/main/resources/application-knowledgegraph.yml b/backend/services/knowledge-graph-service/src/main/resources/application-knowledgegraph.yml new file mode 100644 index 0000000..20dc261 --- /dev/null +++ b/backend/services/knowledge-graph-service/src/main/resources/application-knowledgegraph.yml @@ -0,0 +1,25 @@ +# 知识图谱服务 - Neo4j连接配置 +# 该配置在 main-application 的 spring.config.import 中引入 +# 注意:生产环境务必通过环境变量 NEO4J_PASSWORD 设置密码,不要使用默认值 + +spring: + neo4j: + uri: ${NEO4J_URI:bolt://datamate-neo4j:7687} + authentication: + username: ${NEO4J_USERNAME:neo4j} + password: ${NEO4J_PASSWORD:datamate123} + pool: + max-connection-pool-size: ${NEO4J_POOL_MAX_SIZE:50} + connection-acquisition-timeout: 30s + max-connection-lifetime: 1h + log-leaked-sessions: true + +# 知识图谱服务配置 +datamate: + knowledge-graph: + # 默认查询跳数限制 + max-depth: ${KG_MAX_DEPTH:3} + # 子图返回最大节点数 + max-nodes-per-query: ${KG_MAX_NODES:500} + # 批量导入批次大小 + import-batch-size: ${KG_IMPORT_BATCH_SIZE:100} diff --git a/backend/services/main-application/pom.xml b/backend/services/main-application/pom.xml index 22a60ff..96cd1e7 100644 --- a/backend/services/main-application/pom.xml +++ b/backend/services/main-application/pom.xml @@ -109,6 +109,13 @@ ${project.version} + + + com.datamate + knowledge-graph-service + ${project.version} + + com.mysql diff --git a/backend/services/main-application/src/main/resources/application.yml b/backend/services/main-application/src/main/resources/application.yml index 3f52916..227f995 100644 --- a/backend/services/main-application/src/main/resources/application.yml +++ b/backend/services/main-application/src/main/resources/application.yml @@ -52,6 +52,7 @@ spring: import: - classpath:config/application-datacollection.yml - classpath:config/application-datamanagement.yml + - optional:classpath:application-knowledgegraph.yml # Redis配置 data: diff --git a/backend/services/pom.xml b/backend/services/pom.xml index a3caaa4..fd53194 100644 --- a/backend/services/pom.xml +++ b/backend/services/pom.xml @@ -37,6 +37,9 @@ rag-indexer-service rag-query-service + + knowledge-graph-service + main-application diff --git a/deployment/docker/neo4j/docker-compose.yml b/deployment/docker/neo4j/docker-compose.yml new file mode 100644 index 0000000..3190e13 --- /dev/null +++ b/deployment/docker/neo4j/docker-compose.yml @@ -0,0 +1,39 @@ +services: + datamate-neo4j: + container_name: datamate-neo4j + image: neo4j:5-community + restart: on-failure + ports: + - "7474:7474" # HTTP (Neo4j Browser) + - "7687:7687" # Bolt protocol + environment: + NEO4J_AUTH: neo4j/${NEO4J_PASSWORD:-datamate123} + # Memory configuration + NEO4J_server_memory_heap_initial__size: 512m + NEO4J_server_memory_heap_max__size: 1G + NEO4J_server_memory_pagecache_size: 512m + # Enable APOC plugin + NEO4J_PLUGINS: '["apoc"]' + # Transaction timeout + NEO4J_db_transaction_timeout: 60s + volumes: + - neo4j_data:/data + - neo4j_logs:/logs + networks: [ datamate ] + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:7474 || exit 1"] + interval: 15s + timeout: 10s + retries: 5 + start_period: 30s + +volumes: + neo4j_data: + name: datamate-neo4j-data-volume + neo4j_logs: + name: datamate-neo4j-logs-volume + +networks: + datamate: + driver: bridge + name: datamate-network diff --git a/runtime/datamate-python/app/module/kg_extraction/__init__.py b/runtime/datamate-python/app/module/kg_extraction/__init__.py new file mode 100644 index 0000000..56e8d7c --- /dev/null +++ b/runtime/datamate-python/app/module/kg_extraction/__init__.py @@ -0,0 +1,17 @@ +from app.module.kg_extraction.extractor import KnowledgeGraphExtractor +from app.module.kg_extraction.models import ( + ExtractionRequest, + ExtractionResult, + Triple, + GraphNode, + GraphEdge, +) + +__all__ = [ + "KnowledgeGraphExtractor", + "ExtractionRequest", + "ExtractionResult", + "Triple", + "GraphNode", + "GraphEdge", +] diff --git a/runtime/datamate-python/app/module/kg_extraction/extractor.py b/runtime/datamate-python/app/module/kg_extraction/extractor.py new file mode 100644 index 0000000..498cde8 --- /dev/null +++ b/runtime/datamate-python/app/module/kg_extraction/extractor.py @@ -0,0 +1,183 @@ +"""基于 LLM 的知识图谱三元组抽取器。 + +利用 LangChain 的 LLMGraphTransformer 从非结构化文本中抽取实体和关系, +支持 schema-guided 抽取以提升准确率。 +""" + +from __future__ import annotations + +import logging +from typing import Sequence + +from langchain_core.documents import Document +from langchain_openai import ChatOpenAI +from langchain_experimental.graph_transformers import LLMGraphTransformer + +from app.module.kg_extraction.models import ( + ExtractionRequest, + ExtractionResult, + ExtractionSchema, + GraphEdge, + GraphNode, + Triple, +) + +logger = logging.getLogger(__name__) + + +class KnowledgeGraphExtractor: + """基于 LLMGraphTransformer 的三元组抽取器。 + + Parameters + ---------- + model_name : str + OpenAI 兼容模型名称。 + base_url : str | None + 自定义 API base URL(用于对接 vLLM/Ollama 等本地模型服务)。 + api_key : str + API 密钥。 + temperature : float + 生成温度,抽取任务建议使用较低值。 + """ + + def __init__( + self, + model_name: str = "gpt-4o-mini", + base_url: str | None = None, + api_key: str = "EMPTY", + temperature: float = 0.0, + ) -> None: + self._llm = ChatOpenAI( + model=model_name, + base_url=base_url, + api_key=api_key, + temperature=temperature, + ) + + def _build_transformer( + self, + schema: ExtractionSchema | None = None, + ) -> LLMGraphTransformer: + """根据可选的 schema 约束构造 LLMGraphTransformer。""" + kwargs: dict = {"llm": self._llm} + + if schema: + if schema.entity_types: + kwargs["allowed_nodes"] = [et.name for et in schema.entity_types] + if schema.relation_types: + kwargs["allowed_relationships"] = [rt.name for rt in schema.relation_types] + + return LLMGraphTransformer(**kwargs) + + async def extract(self, request: ExtractionRequest) -> ExtractionResult: + """从文本中抽取三元组。 + + Parameters + ---------- + request : ExtractionRequest + 包含文本、schema 约束等信息的抽取请求。 + + Returns + ------- + ExtractionResult + 抽取得到的节点、边和三元组。 + """ + transformer = self._build_transformer(request.schema) + documents = [Document(page_content=request.text)] + + try: + graph_documents = await transformer.aconvert_to_graph_documents(documents) + except Exception: + logger.exception("LLM graph extraction failed for source_id=%s", request.source_id) + return ExtractionResult(raw_text=request.text, source_id=request.source_id) + + return self._convert_result(graph_documents, request) + + def extract_sync(self, request: ExtractionRequest) -> ExtractionResult: + """同步版本的三元组抽取。""" + transformer = self._build_transformer(request.schema) + documents = [Document(page_content=request.text)] + + try: + graph_documents = transformer.convert_to_graph_documents(documents) + except Exception: + logger.exception("LLM graph extraction failed for source_id=%s", request.source_id) + return ExtractionResult(raw_text=request.text, source_id=request.source_id) + + return self._convert_result(graph_documents, request) + + async def extract_batch( + self, + requests: Sequence[ExtractionRequest], + ) -> list[ExtractionResult]: + """批量抽取。 + + 对多段文本逐一抽取并汇总结果。 + 如需更高吞吐,可自行用 asyncio.gather 并发调用 extract。 + """ + results: list[ExtractionResult] = [] + for req in requests: + result = await self.extract(req) + results.append(result) + return results + + @staticmethod + def _convert_result( + graph_documents: list, + request: ExtractionRequest, + ) -> ExtractionResult: + """将 LangChain GraphDocument 转换为内部数据模型。""" + nodes: list[GraphNode] = [] + edges: list[GraphEdge] = [] + triples: list[Triple] = [] + seen_nodes: set[str] = set() + + for doc in graph_documents: + # 收集节点 + for node in doc.nodes: + node_key = f"{node.id}:{node.type}" + if node_key not in seen_nodes: + seen_nodes.add(node_key) + nodes.append( + GraphNode( + name=node.id, + type=node.type, + properties=node.properties if hasattr(node, "properties") else {}, + ) + ) + + # 收集关系 + for rel in doc.relationships: + source_node = GraphNode( + name=rel.source.id, + type=rel.source.type, + ) + target_node = GraphNode( + name=rel.target.id, + type=rel.target.type, + ) + + edges.append( + GraphEdge( + source=rel.source.id, + target=rel.target.id, + relation_type=rel.type, + properties=rel.properties if hasattr(rel, "properties") else {}, + ) + ) + + triples.append( + Triple( + subject=source_node, + predicate=rel.type, + object=target_node, + ) + ) + + return ExtractionResult( + nodes=nodes, + edges=edges, + triples=triples, + raw_text=request.text, + source_id=request.source_id, + ) diff --git a/runtime/datamate-python/app/module/kg_extraction/models.py b/runtime/datamate-python/app/module/kg_extraction/models.py new file mode 100644 index 0000000..a6220ff --- /dev/null +++ b/runtime/datamate-python/app/module/kg_extraction/models.py @@ -0,0 +1,75 @@ +"""知识图谱三元组抽取数据模型。""" + +from __future__ import annotations + +from pydantic import BaseModel, Field + + +class GraphNode(BaseModel): + """图谱节点(实体)。""" + + name: str = Field(..., description="实体名称") + type: str = Field(..., description="实体类型, 如 Person, Organization, Location") + properties: dict[str, object] = Field(default_factory=dict, description="扩展属性") + + +class GraphEdge(BaseModel): + """图谱边(关系)。""" + + source: str = Field(..., description="源实体名称") + target: str = Field(..., description="目标实体名称") + relation_type: str = Field(..., description="关系类型, 如 works_at, located_in") + properties: dict[str, object] = Field(default_factory=dict, description="关系属性") + + +class Triple(BaseModel): + """知识三元组: (主体, 关系, 客体)。""" + + subject: GraphNode + predicate: str = Field(..., description="关系类型") + object: GraphNode + + +class EntityTypeConstraint(BaseModel): + """实体类型约束,用于 Schema-guided 抽取。""" + + name: str = Field(..., description="类型名称") + description: str = Field(default="", description="类型说明") + + +class RelationTypeConstraint(BaseModel): + """关系类型约束。""" + + name: str = Field(..., description="关系类型名称") + source_types: list[str] = Field(default_factory=list, description="允许的源实体类型") + target_types: list[str] = Field(default_factory=list, description="允许的目标实体类型") + description: str = Field(default="", description="关系说明") + + +class ExtractionSchema(BaseModel): + """抽取 schema 约束,约束 LLM 输出的实体和关系类型范围。""" + + entity_types: list[EntityTypeConstraint] = Field(default_factory=list) + relation_types: list[RelationTypeConstraint] = Field(default_factory=list) + + +class ExtractionRequest(BaseModel): + """三元组抽取请求。""" + + text: str = Field(..., description="待抽取的文本") + graph_id: str = Field(..., description="目标图谱 ID") + schema: ExtractionSchema | None = Field( + default=None, description="可选的 schema 约束, 提供后做 schema-guided 抽取" + ) + source_id: str | None = Field(default=None, description="来源 ID(数据集/知识库条目)") + source_type: str = Field(default="KNOWLEDGE_BASE", description="来源类型") + + +class ExtractionResult(BaseModel): + """三元组抽取结果。""" + + nodes: list[GraphNode] = Field(default_factory=list) + edges: list[GraphEdge] = Field(default_factory=list) + triples: list[Triple] = Field(default_factory=list) + raw_text: str = Field(default="", description="原始文本") + source_id: str | None = None diff --git a/runtime/datamate-python/pyproject.toml b/runtime/datamate-python/pyproject.toml index 14f23d5..a83c23c 100644 --- a/runtime/datamate-python/pyproject.toml +++ b/runtime/datamate-python/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "openai (>=2.9.0,<3.0.0)", "langchain-openai (>=1.1.1,<2.0.0)", "langchain (>=1.1.3,<2.0.0)", + "langchain-experimental (>=0.3.0,<1.0.0)", "pydantic (>=2.12.5,<3.0.0)", "sqlalchemy (>=2.0.45,<3.0.0)", "fastapi (>=0.124.0,<0.125.0)",