Files
DataMate/runtime/datamate-python/app/core/config.py
Jerry Yan 9b6ff59a11 feat(kg): 实现 Phase 3.3 性能优化
核心功能:
- Neo4j 索引优化(entityType, graphId, properties.name)
- Redis 缓存(Java 侧,3 个缓存区,TTL 可配置)
- LRU 缓存(Python 侧,KG + Embedding,线程安全)
- 细粒度缓存清除(graphId 前缀匹配)
- 失败路径缓存清除(finally 块)

新增文件(Java 侧,7 个):
- V2__PerformanceIndexes.java - Flyway 迁移,创建 3 个索引
- IndexHealthService.java - 索引健康监控
- RedisCacheConfig.java - Spring Cache + Redis 配置
- GraphCacheService.java - 缓存清除管理器
- CacheableIntegrationTest.java - 集成测试(10 tests)
- GraphCacheServiceTest.java - 单元测试(19 tests)
- V2__PerformanceIndexesTest.java, IndexHealthServiceTest.java

新增文件(Python 侧,2 个):
- cache.py - 内存 TTL+LRU 缓存(cachetools)
- test_cache.py - 单元测试(20 tests)

修改文件(Java 侧,9 个):
- GraphEntityService.java - 添加 @Cacheable,缓存清除
- GraphQueryService.java - 添加 @Cacheable(包含用户权限上下文)
- GraphRelationService.java - 添加缓存清除
- GraphSyncService.java - 添加缓存清除(finally 块,失败路径)
- KnowledgeGraphProperties.java - 添加 Cache 配置类
- application-knowledgegraph.yml - 添加 Redis 和缓存 TTL 配置
- GraphEntityServiceTest.java - 添加 verify(cacheService) 断言
- GraphRelationServiceTest.java - 添加 verify(cacheService) 断言
- GraphSyncServiceTest.java - 添加失败路径缓存清除测试

修改文件(Python 侧,5 个):
- kg_client.py - 集成缓存(fulltext_search, get_subgraph)
- interface.py - 添加 /cache/stats 和 /cache/clear 端点
- config.py - 添加缓存配置字段
- pyproject.toml - 添加 cachetools 依赖
- test_kg_client.py - 添加 _disable_cache fixture

安全修复(3 轮迭代):
- P0: 缓存 key 用户隔离(防止跨用户数据泄露)
- P1-1: 同步子步骤后的缓存清除(18 个方法)
- P1-2: 实体创建后的搜索缓存清除
- P1-3: 失败路径缓存清除(finally 块)
- P2-1: 细粒度缓存清除(graphId 前缀匹配,避免跨图谱冲刷)
- P2-2: 服务层测试添加 verify(cacheService) 断言

测试结果:
- Java: 280 tests pass  (270 → 280, +10 new)
- Python: 154 tests pass  (140 → 154, +14 new)

缓存配置:
- kg:entities - 实体缓存,TTL 1h
- kg:queries - 查询结果缓存,TTL 5min
- kg:search - 全文搜索缓存,TTL 3min
- KG cache (Python) - 256 entries, 5min TTL
- Embedding cache (Python) - 512 entries, 10min TTL
2026-02-20 18:28:33 +08:00

158 lines
5.7 KiB
Python

from pydantic_settings import BaseSettings
from pydantic import SecretStr, model_validator
from typing import Optional
import logging
import os
# Module-level logger; the settings validators use it to report weak credentials.
_logger = logging.getLogger(__name__)

# Known weak default credentials; these must never be used in production.
# NOTE(review): the Label Studio default password declared on Settings
# ("demoadmin") is not part of this set -- confirm whether that is intentional.
_BLOCKED_DEFAULT_PASSWORDS = {
    "admin",
    "root",
    "password",
    "123456",
    "datamate123",
}
_BLOCKED_DEFAULT_TOKENS = {
    "EMPTY",
    "abc123abc123",
}
class Settings(BaseSettings):
    """Application configuration loaded from the environment and ``.env``.

    Every attribute below is a setting together with its default value.
    Two ``after``-mode validators run once the fields are populated:

    * ``build_database_url`` derives ``database_url`` from the MySQL
      settings when no explicit URL was supplied.
    * ``check_default_credentials`` warns about, or rejects, known weak
      default credentials depending on the ``DATAMATE_ENV`` environment
      variable.
    """

    class Config:
        env_file = ".env"
        case_sensitive = False
        extra = 'ignore'

    # Service
    app_name: str = "DataMate Python Backend"
    app_version: str = "1.0.0"
    app_description: str = "Adapter for integrating Data Management System with Label Studio"
    host: str = "0.0.0.0"
    port: int = 18000

    # CORS
    # allowed_origins: List[str] = ["*"]
    # allowed_methods: List[str] = ["*"]
    # allowed_headers: List[str] = ["*"]

    # Log
    log_level: str = "INFO"
    debug: bool = True
    log_file_dir: str = "/var/log/datamate/backend-python"

    # Database
    mysql_host: str = "datamate-database"
    mysql_port: int = 3306
    mysql_user: str = "root"
    mysql_password: str = "password"
    mysql_database: str = "datamate"
    database_url: str = ""  # Filled in by build_database_url() when left empty

    @model_validator(mode='after')
    def build_database_url(self):
        """Derive ``database_url`` from the MySQL settings when it is empty.

        An explicitly configured ``database_url`` is left untouched.
        Otherwise a ``mysql+aiomysql://`` URL is assembled from the
        ``mysql_*`` fields. The user name and password are percent-encoded
        so that reserved characters (``@``, ``:``, ``/``, ``#`` ...) cannot
        corrupt the URL. A configured user name is kept even when the
        password is empty; a password without a user name is ignored.

        Returns:
            This settings instance, as required of ``after`` validators.
        """
        if self.database_url:
            return self

        # Local import keeps this block independent of the module's import list.
        from urllib.parse import quote

        credentials = ""
        if self.mysql_user:
            # safe="" also encodes "/" so nothing in the value can be
            # mistaken for URL structure.
            credentials = quote(self.mysql_user, safe="")
            if self.mysql_password:
                credentials += ":" + quote(self.mysql_password, safe="")
            credentials += "@"

        self.database_url = (
            f"mysql+aiomysql://{credentials}"
            f"{self.mysql_host}:{self.mysql_port}/{self.mysql_database}"
        )
        return self

    # Label Studio
    label_studio_base_url: str = "http://label-studio:8000"
    label_studio_username: Optional[str] = "admin@demo.com"
    label_studio_password: Optional[str] = "demoadmin"
    label_studio_user_token: Optional[str] = "abc123abc123"  # Legacy Token
    label_studio_local_document_root: str = "/label-studio/local"  # Label Studio local file storage path
    label_studio_file_path_prefix: str = "/data/local-files/?d="  # Label Studio local file serving URL prefix
    ls_task_page_size: int = 1000

    # DataMate
    dm_file_path_prefix: str = "/dataset"  # DataMate storage folder prefix

    # DataMate Backend (Java) - used to read file contents through its
    # download/preview endpoints
    datamate_backend_base_url: str = "http://datamate-backend:8080/api"

    # Knowledge Graph - LLM triple extraction
    kg_llm_api_key: SecretStr = SecretStr("EMPTY")
    kg_llm_base_url: Optional[str] = None
    kg_llm_model: str = "gpt-4o-mini"
    kg_llm_temperature: float = 0.0
    kg_llm_timeout_seconds: int = 60
    kg_llm_max_retries: int = 2

    # Knowledge Graph - entity alignment
    kg_alignment_enabled: bool = False
    kg_alignment_embedding_model: str = "text-embedding-3-small"
    kg_alignment_vector_threshold: float = 0.92
    kg_alignment_llm_threshold: float = 0.78

    # GraphRAG fused query
    graphrag_enabled: bool = False
    graphrag_milvus_uri: str = "http://milvus-standalone:19530"
    graphrag_kg_service_url: str = "http://datamate-kg:8080"
    graphrag_kg_internal_token: str = ""

    # GraphRAG - retrieval strategy defaults
    graphrag_vector_top_k: int = 5
    graphrag_graph_depth: int = 2
    graphrag_graph_max_entities: int = 20
    graphrag_vector_weight: float = 0.6
    graphrag_graph_weight: float = 0.4

    # GraphRAG - LLM (empty values fall back to the kg_llm_* settings)
    graphrag_llm_model: str = ""
    graphrag_llm_base_url: Optional[str] = None
    graphrag_llm_api_key: SecretStr = SecretStr("EMPTY")
    graphrag_llm_temperature: float = 0.1
    graphrag_llm_timeout_seconds: int = 60

    # GraphRAG - embedding (empty falls back to the kg_alignment_embedding_* settings)
    graphrag_embedding_model: str = ""

    # GraphRAG - cache configuration
    graphrag_cache_enabled: bool = True
    graphrag_cache_kg_maxsize: int = 256
    graphrag_cache_kg_ttl: int = 300
    graphrag_cache_embedding_maxsize: int = 512
    graphrag_cache_embedding_ttl: int = 600

    # Annotation editor (Label Studio Editor)
    editor_max_text_bytes: int = 0  # <= 0 means unlimited; a positive value is the byte limit

    @model_validator(mode='after')
    def check_default_credentials(self):
        """Detect weak default credentials and refuse to start outside dev.

        The environment is read from the ``DATAMATE_ENV`` environment
        variable (case-insensitive, defaulting to ``dev`` when unset):

        * ``dev`` / ``test`` / ``local`` / ``development``: only a warning
          is logged.
        * anything else (``prod``, ``staging`` ...): a ``ValueError`` is
          raised, which aborts construction of the settings object.

        The offending credential values are deliberately not included in
        the message so they never end up in logs or tracebacks.

        Returns:
            This settings instance, as required of ``after`` validators.

        Raises:
            ValueError: A weak default credential is configured in a
                non-development environment.
        """
        env = os.environ.get("DATAMATE_ENV", "dev").lower()
        is_dev = env in ("dev", "test", "local", "development")

        issues: list[str] = []
        if self.mysql_password in _BLOCKED_DEFAULT_PASSWORDS:
            issues.append("mysql_password is set to a weak default")
        if self.label_studio_password and self.label_studio_password in _BLOCKED_DEFAULT_PASSWORDS:
            issues.append("label_studio_password is set to a weak default")
        if self.label_studio_user_token and self.label_studio_user_token in _BLOCKED_DEFAULT_TOKENS:
            issues.append("label_studio_user_token is set to a weak default")

        if not issues:
            return self

        msg = "SECURITY: Weak default credentials detected: " + "; ".join(issues)
        if not is_dev:
            raise ValueError(
                msg + ". Set proper credentials via environment variables "
                "before deploying to production."
            )
        _logger.warning("%s (acceptable in dev/test, MUST change for production)", msg)
        return self
# Global settings instance, created once when this module is imported.
settings = Settings()