// =============================================================================
// DataMate Knowledge Graph - Neo4j schema initialization script
// Schema version: 1.0.0
// Last updated:   2026-02-17
//
// Usage:
//   1. Run through Cypher Shell:
//        cat schema.cypher | cypher-shell -u neo4j -p <password>
//   2. Or execute it section by section in Neo4j Browser.
//
// Notes:
//   - Every index and constraint uses IF NOT EXISTS, and the sample data in
//     Parts 5 and 6 is written with MERGE keyed on `id`, so the whole script
//     can be executed repeatedly without errors or duplicates.
//   - A constraint creates its backing index automatically; do not create a
//     separate index for the same property.
//   - The original authors note that relationship property indexes require
//     Neo4j Enterprise Edition and that Community Edition should rely on
//     inline property matching instead (see Part 4).
//     NOTE(review): Neo4j 4.3+ documents relationship property indexes
//     without an edition restriction - confirm against the deployed version.
//   - WARNING: Parts 5 and 6 are NOT commented out. Piping the whole file
//     into cypher-shell also loads the sample graph. For production, execute
//     Parts 1 - 3.1 only.
// =============================================================================

// =============================================================================
// Part 1: Node constraints
// =============================================================================

// Uniqueness constraint on Entity.id (creates its backing index automatically).
CREATE CONSTRAINT entity_id_unique IF NOT EXISTS
FOR (n:Entity) REQUIRE n.id IS UNIQUE;

// =============================================================================
// Part 2: Node indexes
// =============================================================================

// graph_id - core index for multi-tenant isolation; every query carries graph_id.
CREATE INDEX entity_graph_id IF NOT EXISTS
FOR (n:Entity) ON (n.graph_id);

// type - filter by entity type.
CREATE INDEX entity_type IF NOT EXISTS
FOR (n:Entity) ON (n.type);

// name - search by name.
CREATE INDEX entity_name IF NOT EXISTS
FOR (n:Entity) ON (n.name);

// source_id - look up by source ID during MySQL -> Neo4j synchronisation.
CREATE INDEX entity_source_id IF NOT EXISTS
FOR (n:Entity) ON (n.source_id);

// Composite (graph_id, type) - entities of a given type inside one graph.
CREATE INDEX entity_graph_id_type IF NOT EXISTS
FOR (n:Entity) ON (n.graph_id, n.type);

// Composite (graph_id, id) - exact entity lookup (the most common query path).
CREATE INDEX entity_graph_id_id IF NOT EXISTS
FOR (n:Entity) ON (n.graph_id, n.id);

// Composite (graph_id, source_id) - look up by source ID during synchronisation.
CREATE INDEX entity_graph_id_source_id IF NOT EXISTS
FOR (n:Entity) ON (n.graph_id, n.source_id);

// created_at - ordering by creation time.
CREATE INDEX entity_created_at IF NOT EXISTS
FOR (n:Entity) ON (n.created_at);

// =============================================================================
// Part 3: Full-text index (used for fuzzy search)
// =============================================================================

// Full-text index over Entity name + description.
CREATE FULLTEXT INDEX entity_fulltext IF NOT EXISTS
FOR (n:Entity) ON EACH [n.name, n.description];

// =============================================================================
// Part 3.1: SyncHistory constraints and indexes (synchronisation metadata nodes)
// =============================================================================

// Unique (graph_id, sync_id) - prevents duplicate records when sync IDs collide.
CREATE CONSTRAINT sync_history_graph_sync_unique IF NOT EXISTS
FOR (h:SyncHistory) REQUIRE (h.graph_id, h.sync_id) IS UNIQUE;

// (graph_id, started_at) - speeds up time-range queries over sync history.
CREATE INDEX sync_history_graph_started IF NOT EXISTS
FOR (h:SyncHistory) ON (h.graph_id, h.started_at);

// (graph_id, status, started_at) - speeds up status + time filtering.
CREATE INDEX sync_history_graph_status_started IF NOT EXISTS
FOR (h:SyncHistory) ON (h.graph_id, h.status, h.started_at);

// =============================================================================
// Part 4: Relationship properties
// =============================================================================
// No relationship indexes are created by this script. Relationship queries
// locate the nodes through the node indexes above and then match relationship
// properties inline:
//   -[r:RELATED_TO {graph_id: $graphId, relation_type: $type}]->
//
// The original authors state that relationship property indexes are not
// supported on Neo4j Community Edition (NOTE(review): confirm for the deployed
// version). Where they are available, uncomment the statements below:
//
// CREATE INDEX rel_graph_id IF NOT EXISTS
// FOR ()-[r:RELATED_TO]-() ON (r.graph_id);
//
// CREATE INDEX rel_relation_type IF NOT EXISTS
// FOR ()-[r:RELATED_TO]-() ON (r.relation_type);
//
// CREATE INDEX rel_id IF NOT EXISTS
// FOR ()-[r:RELATED_TO]-() ON (r.id);

// =============================================================================
// Part 5: Sample data (optional, used to validate the schema)
// =============================================================================
// The sample data uses a fixed graph_id and is meant for development and test
// environments. In production, graph_id is generated and managed by the
// application layer.
//
// Each node is written with MERGE on its unique `id`; the remaining properties
// are applied only when the node is first created (ON CREATE SET). A plain
// CREATE would violate entity_id_unique on the second run and abort the script.

// --- Sample organisation ---
MERGE (org:Entity {id: '00000000-0000-0000-0000-000000000001'})
ON CREATE SET org += {
  name: '数据工程部',
  type: 'Org',
  description: '负责数据采集、清洗和标注',
  graph_id: '11111111-1111-1111-1111-111111111111',
  source_type: 'MANUAL',
  confidence: 1.0,
  properties_json: '{"org_code":"DE","level":1,"member_count":15}',
  created_at: datetime()
};

// --- Sample user ---
MERGE (user:Entity {id: '00000000-0000-0000-0000-000000000002'})
ON CREATE SET user += {
  name: '张三',
  type: 'User',
  graph_id: '11111111-1111-1111-1111-111111111111',
  source_type: 'SYNC',
  confidence: 1.0,
  properties_json: '{"username":"zhangsan","email":"zhangsan@example.com","role":"USER","enabled":true}',
  created_at: datetime()
};

// --- Sample dataset (source / raw) ---
MERGE (ds1:Entity {id: '00000000-0000-0000-0000-000000000010'})
ON CREATE SET ds1 += {
  name: '用户行为日志-原始',
  type: 'Dataset',
  description: '原始用户行为埋点数据',
  graph_id: '11111111-1111-1111-1111-111111111111',
  source_id: '100',
  source_type: 'SYNC',
  confidence: 1.0,
  properties_json: '{"dataset_type":"TEXT","status":"ACTIVE","category":"用户行为","format":"JSON","record_count":2000000,"size_bytes":3221225472}',
  created_at: datetime()
};

// --- Sample dataset (cleaned) ---
MERGE (ds2:Entity {id: '00000000-0000-0000-0000-000000000011'})
ON CREATE SET ds2 += {
  name: '用户行为日志-清洗后',
  type: 'Dataset',
  description: '经过去重和格式标准化的用户行为数据',
  graph_id: '11111111-1111-1111-1111-111111111111',
  source_id: '101',
  source_type: 'SYNC',
  confidence: 1.0,
  properties_json: '{"dataset_type":"TEXT","status":"ACTIVE","category":"用户行为","format":"JSON","record_count":1500000,"size_bytes":2147483648,"version":1}',
  created_at: datetime()
};

// --- Sample fields ---
MERGE (f1:Entity {id: '00000000-0000-0000-0000-000000000020'})
ON CREATE SET f1 += {
  name: 'user_id',
  type: 'Field',
  description: '用户唯一标识符',
  graph_id: '11111111-1111-1111-1111-111111111111',
  source_type: 'SYNC',
  confidence: 1.0,
  properties_json: '{"data_type":"STRING","nullable":false,"is_primary_key":true}',
  created_at: datetime()
};

MERGE (f2:Entity {id: '00000000-0000-0000-0000-000000000021'})
ON CREATE SET f2 += {
  name: 'event_type',
  type: 'Field',
  description: '事件类型',
  graph_id: '11111111-1111-1111-1111-111111111111',
  source_type: 'SYNC',
  confidence: 1.0,
  properties_json: '{"data_type":"STRING","nullable":false,"sample_values":["click","view","purchase"]}',
  created_at: datetime()
};

MERGE (f3:Entity {id: '00000000-0000-0000-0000-000000000022'})
ON CREATE SET f3 += {
  name: 'user_id',
  type: 'Field',
  description: '用户唯一标识符(清洗后)',
  graph_id: '11111111-1111-1111-1111-111111111111',
  source_type: 'SYNC',
  confidence: 1.0,
  properties_json: '{"data_type":"STRING","nullable":false,"is_primary_key":true}',
  created_at: datetime()
};

// --- Sample workflow ---
MERGE (wf:Entity {id: '00000000-0000-0000-0000-000000000030'})
ON CREATE SET wf += {
  name: '文本去重清洗管道',
  type: 'Workflow',
  description: 'SimHash去重 + 格式标准化 + 空值过滤',
  graph_id: '11111111-1111-1111-1111-111111111111',
  source_type: 'SYNC',
  confidence: 1.0,
  properties_json: '{"workflow_type":"CLEANING","status":"ACTIVE","version":"1.0","operator_count":3}',
  created_at: datetime()
};

// --- Sample job ---
MERGE (job:Entity {id: '00000000-0000-0000-0000-000000000040'})
ON CREATE SET job += {
  name: '清洗作业-20260215-001',
  type: 'Job',
  description: '用户行为日志去重清洗',
  graph_id: '11111111-1111-1111-1111-111111111111',
  source_id: '500',
  source_type: 'SYNC',
  confidence: 1.0,
  properties_json: '{"job_type":"CLEANING","status":"COMPLETED","started_at":"2026-02-15T10:00:00","completed_at":"2026-02-15T10:35:00","duration_seconds":2100,"input_count":2000000,"output_count":1500000}',
  created_at: datetime()
};

// --- Sample labelling task ---
MERGE (lt:Entity {id: '00000000-0000-0000-0000-000000000050'})
ON CREATE SET lt += {
  name: '情感分析标注-批次1',
  type: 'LabelTask',
  description: '用户评论情感标注(正面/负面/中性)',
  graph_id: '11111111-1111-1111-1111-111111111111',
  source_id: '600',
  source_type: 'SYNC',
  confidence: 1.0,
  properties_json: '{"task_mode":"MANUAL","data_type":"text","labeling_type":"sentiment_analysis","status":"IN_PROGRESS","progress":30.0,"template_name":"情感分析"}',
  created_at: datetime()
};

// --- Sample knowledge set ---
MERGE (ks:Entity {id: '00000000-0000-0000-0000-000000000060'})
ON CREATE SET ks += {
  name: '用户行为分析知识库',
  type: 'KnowledgeSet',
  description: '从用户行为数据中提取的业务规则和洞察',
  graph_id: '11111111-1111-1111-1111-111111111111',
  source_type: 'SYNC',
  confidence: 1.0,
  properties_json: '{"status":"PUBLISHED","domain":"用户行为","business_line":"数据分析","sensitivity":"INTERNAL","item_count":85}',
  created_at: datetime()
};

// =============================================================================
// Part 6: Sample relationships
// =============================================================================
// Every relationship has the type RELATED_TO; the semantic kind is stored in
// the relation_type property. Relationships are MERGEd on their `id` between
// the two matched endpoints so that re-running the script does not create
// duplicate edges; all other properties are set only on first creation.

// HAS_FIELD: source dataset -> field
MATCH (ds1:Entity {id: '00000000-0000-0000-0000-000000000010'})
MATCH (f1:Entity {id: '00000000-0000-0000-0000-000000000020'})
MERGE (ds1)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000001'}]->(f1)
ON CREATE SET r += {
  relation_type: 'HAS_FIELD',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 1.0,
  source_id: '',
  properties_json: '{"ordinal":0,"required":true}',
  created_at: datetime()
};

MATCH (ds1:Entity {id: '00000000-0000-0000-0000-000000000010'})
MATCH (f2:Entity {id: '00000000-0000-0000-0000-000000000021'})
MERGE (ds1)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000002'}]->(f2)
ON CREATE SET r += {
  relation_type: 'HAS_FIELD',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 1.0,
  source_id: '',
  properties_json: '{"ordinal":1,"required":true}',
  created_at: datetime()
};

// HAS_FIELD: cleaned dataset -> field
MATCH (ds2:Entity {id: '00000000-0000-0000-0000-000000000011'})
MATCH (f3:Entity {id: '00000000-0000-0000-0000-000000000022'})
MERGE (ds2)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000003'}]->(f3)
ON CREATE SET r += {
  relation_type: 'HAS_FIELD',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 1.0,
  source_id: '',
  properties_json: '{"ordinal":0,"required":true}',
  created_at: datetime()
};

// DERIVED_FROM: cleaned dataset -> source dataset
MATCH (ds2:Entity {id: '00000000-0000-0000-0000-000000000011'})
MATCH (ds1:Entity {id: '00000000-0000-0000-0000-000000000010'})
MERGE (ds2)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000004'}]->(ds1)
ON CREATE SET r += {
  relation_type: 'DERIVED_FROM',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 1.0,
  source_id: '',
  properties_json: '{"derivation_type":"CLEANING","job_id":"00000000-0000-0000-0000-000000000040","transformation":"SimHash去重 + 空值过滤"}',
  created_at: datetime()
};

// TRIGGERS: workflow -> job
MATCH (wf:Entity {id: '00000000-0000-0000-0000-000000000030'})
MATCH (job:Entity {id: '00000000-0000-0000-0000-000000000040'})
MERGE (wf)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000005'}]->(job)
ON CREATE SET r += {
  relation_type: 'TRIGGERS',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 1.0,
  source_id: '',
  properties_json: '{"trigger_type":"MANUAL","triggered_at":"2026-02-15T10:00:00"}',
  created_at: datetime()
};

// USES_DATASET: job -> source dataset
MATCH (job:Entity {id: '00000000-0000-0000-0000-000000000040'})
MATCH (ds1:Entity {id: '00000000-0000-0000-0000-000000000010'})
MERGE (job)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000006'}]->(ds1)
ON CREATE SET r += {
  relation_type: 'USES_DATASET',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 1.0,
  source_id: '',
  properties_json: '{"usage_role":"INPUT"}',
  created_at: datetime()
};

// PRODUCES: job -> cleaned dataset
MATCH (job:Entity {id: '00000000-0000-0000-0000-000000000040'})
MATCH (ds2:Entity {id: '00000000-0000-0000-0000-000000000011'})
MERGE (job)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000007'}]->(ds2)
ON CREATE SET r += {
  relation_type: 'PRODUCES',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 1.0,
  source_id: '',
  properties_json: '{"output_type":"PRIMARY"}',
  created_at: datetime()
};

// ASSIGNED_TO: labelling task -> user
MATCH (lt:Entity {id: '00000000-0000-0000-0000-000000000050'})
MATCH (user:Entity {id: '00000000-0000-0000-0000-000000000002'})
MERGE (lt)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000008'}]->(user)
ON CREATE SET r += {
  relation_type: 'ASSIGNED_TO',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 1.0,
  source_id: '',
  properties_json: '{"assigned_at":"2026-02-14T09:00:00","role":"EXECUTOR"}',
  created_at: datetime()
};

// USES_DATASET: labelling task -> cleaned dataset
MATCH (lt:Entity {id: '00000000-0000-0000-0000-000000000050'})
MATCH (ds2:Entity {id: '00000000-0000-0000-0000-000000000011'})
MERGE (lt)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000009'}]->(ds2)
ON CREATE SET r += {
  relation_type: 'USES_DATASET',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 1.0,
  source_id: '',
  properties_json: '{"usage_role":"INPUT"}',
  created_at: datetime()
};

// BELONGS_TO: user -> organisation
MATCH (user:Entity {id: '00000000-0000-0000-0000-000000000002'})
MATCH (org:Entity {id: '00000000-0000-0000-0000-000000000001'})
MERGE (user)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000010'}]->(org)
ON CREATE SET r += {
  relation_type: 'BELONGS_TO',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 1.0,
  source_id: '',
  properties_json: '{"membership_type":"PRIMARY","since":"2025-03-01T00:00:00"}',
  created_at: datetime()
};

// BELONGS_TO: source dataset -> organisation
MATCH (ds1:Entity {id: '00000000-0000-0000-0000-000000000010'})
MATCH (org:Entity {id: '00000000-0000-0000-0000-000000000001'})
MERGE (ds1)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000011'}]->(org)
ON CREATE SET r += {
  relation_type: 'BELONGS_TO',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 1.0,
  source_id: '',
  properties_json: '{"membership_type":"PRIMARY"}',
  created_at: datetime()
};

// IMPACTS: source field -> cleaned field
MATCH (f1:Entity {id: '00000000-0000-0000-0000-000000000020'})
MATCH (f3:Entity {id: '00000000-0000-0000-0000-000000000022'})
MERGE (f1)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000012'}]->(f3)
ON CREATE SET r += {
  relation_type: 'IMPACTS',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 0.95,
  source_id: '',
  properties_json: '{"impact_type":"DIRECT","job_id":"00000000-0000-0000-0000-000000000040"}',
  created_at: datetime()
};

// SOURCED_FROM: knowledge set -> cleaned dataset
MATCH (ks:Entity {id: '00000000-0000-0000-0000-000000000060'})
MATCH (ds2:Entity {id: '00000000-0000-0000-0000-000000000011'})
MERGE (ks)-[r:RELATED_TO {id: '00000000-0000-0000-0000-100000000013'}]->(ds2)
ON CREATE SET r += {
  relation_type: 'SOURCED_FROM',
  graph_id: '11111111-1111-1111-1111-111111111111',
  weight: 1.0,
  confidence: 0.85,
  source_id: '',
  properties_json: '{"extraction_method":"LLM","extracted_at":"2026-02-16T14:30:00","item_count":85}',
  created_at: datetime()
};

// =============================================================================
// Part 7: Verification queries
// =============================================================================
// These queries are intentionally commented out; run them by hand after
// loading the sample data.

// Node count per entity type
// MATCH (n:Entity {graph_id: '11111111-1111-1111-1111-111111111111'})
// RETURN n.type AS type, count(*) AS count
// ORDER BY count DESC;

// Relationship count per relation type
// MATCH (:Entity {graph_id: '11111111-1111-1111-1111-111111111111'})
//   -[r:RELATED_TO {graph_id: '11111111-1111-1111-1111-111111111111'}]->
//   (:Entity {graph_id: '11111111-1111-1111-1111-111111111111'})
// RETURN r.relation_type AS type, count(*) AS count
// ORDER BY count DESC;

// End-to-end lineage starting from the cleaned dataset
// (the anchor node is scoped by graph_id because names are not unique)
// MATCH path = (ds2:Entity {graph_id: '11111111-1111-1111-1111-111111111111', name: '用户行为日志-清洗后'})
//   -[:RELATED_TO *1..5]->
//   (origin:Entity)
// WHERE ALL(r IN relationships(path) WHERE r.graph_id = '11111111-1111-1111-1111-111111111111')
// RETURN path;

// =============================================================================
// Part 8: Clean up the sample data (optional)
// =============================================================================
// To remove the sample data, run the statement below. It deletes every Entity
// node carrying the sample graph_id together with the relationships attached
// to those nodes (DETACH DELETE).

// MATCH (n:Entity {graph_id: '11111111-1111-1111-1111-111111111111'})
// DETACH DELETE n;