You've already forked DataMate
fix: 修复codex review发现的问题
问题1 - 行锁持有时间过长:
- 采用双重检查锁定模式,将HTTP调用移到锁范围外
- 新增 _update_knowledge_set_config 方法专门处理加锁更新

问题2 - 清理不完整:
- _list_knowledge_sets 方法添加分页参数
- 新增 _list_all_knowledge_sets 方法遍历所有知识集
- 清理方法使用新的全量查询方法

问题3 - 文件删除逻辑可能误删:
- deleteKnowledgeItemFile 方法增加严格的 sourceType 检查
- 只有当 sourceType 为 FILE_UPLOAD 或 MANUAL 时才删除文件
- 避免误删 DATASET_FILE 类型的数据集文件

涉及文件:
- knowledge_sync.py
- KnowledgeItemApplicationService.java
This commit is contained in:
@@ -809,8 +809,8 @@ public class KnowledgeItemApplicationService {
|
|||||||
if (knowledgeItem == null) {
|
if (knowledgeItem == null) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (knowledgeItem.getSourceType() == KnowledgeSourceType.FILE_UPLOAD
|
KnowledgeSourceType sourceType = knowledgeItem.getSourceType();
|
||||||
|| knowledgeItem.getContentType() == KnowledgeContentType.FILE) {
|
if (sourceType == KnowledgeSourceType.FILE_UPLOAD || sourceType == KnowledgeSourceType.MANUAL) {
|
||||||
String relativePath = knowledgeItem.getContent();
|
String relativePath = knowledgeItem.getContent();
|
||||||
if (StringUtils.isNotBlank(relativePath)) {
|
if (StringUtils.isNotBlank(relativePath)) {
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -73,65 +73,46 @@ class KnowledgeSyncService:
|
|||||||
logger.warning("标注同步到知识管理失败:%s", exc)
|
logger.warning("标注同步到知识管理失败:%s", exc)
|
||||||
|
|
||||||
async def _ensure_knowledge_set(self, project: LabelingProject) -> Optional[str]:
|
async def _ensure_knowledge_set(self, project: LabelingProject) -> Optional[str]:
|
||||||
result = await self.db.execute(
|
# 第一次检查:无锁查询配置
|
||||||
select(LabelingProject)
|
|
||||||
.where(LabelingProject.id == project.id)
|
|
||||||
.with_for_update()
|
|
||||||
)
|
|
||||||
locked_project = result.scalar_one_or_none()
|
|
||||||
if not locked_project:
|
|
||||||
logger.warning("标注同步失败:无法锁定项目:project_id=%s", project.id)
|
|
||||||
return None
|
|
||||||
|
|
||||||
config = (
|
config = (
|
||||||
locked_project.configuration
|
project.configuration if isinstance(project.configuration, dict) else {}
|
||||||
if isinstance(locked_project.configuration, dict)
|
|
||||||
else {}
|
|
||||||
)
|
)
|
||||||
set_id = config.get(self.CONFIG_KEY_SET_ID)
|
set_id = config.get(self.CONFIG_KEY_SET_ID)
|
||||||
|
|
||||||
if set_id:
|
if set_id:
|
||||||
exists = await self._get_knowledge_set(set_id)
|
exists = await self._get_knowledge_set(set_id)
|
||||||
if exists and self._metadata_matches_project(
|
if exists and self._metadata_matches_project(
|
||||||
exists.get("metadata"), locked_project.id
|
exists.get("metadata"), project.id
|
||||||
):
|
):
|
||||||
return set_id
|
return set_id
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"知识集不存在或归属不匹配,准备重建:set_id=%s project_id=%s",
|
"知识集不存在或归属不匹配,准备重建:set_id=%s project_id=%s",
|
||||||
set_id,
|
set_id,
|
||||||
locked_project.id,
|
project.id,
|
||||||
)
|
)
|
||||||
|
set_id = None
|
||||||
|
|
||||||
project_name = (
|
project_name = (
|
||||||
locked_project.name or "annotation-project"
|
project.name or "annotation-project"
|
||||||
).strip() or "annotation-project"
|
).strip() or "annotation-project"
|
||||||
metadata = self._build_set_metadata(locked_project)
|
metadata = self._build_set_metadata(project)
|
||||||
|
|
||||||
existing = await self._find_knowledge_set_by_name_and_project(
|
existing = await self._find_knowledge_set_by_name_and_project(
|
||||||
project_name, locked_project.id
|
project_name, project.id
|
||||||
)
|
)
|
||||||
if existing:
|
if existing:
|
||||||
await self._update_project_config(
|
return await self._update_knowledge_set_config(project, existing)
|
||||||
locked_project,
|
|
||||||
{
|
|
||||||
self.CONFIG_KEY_SET_ID: existing.get("id"),
|
|
||||||
self.CONFIG_KEY_SET_NAME: existing.get("name"),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
return existing.get("id")
|
|
||||||
|
|
||||||
created = await self._create_knowledge_set(project_name, metadata)
|
created = await self._create_knowledge_set(project_name, metadata)
|
||||||
if not created:
|
if not created:
|
||||||
created = await self._find_knowledge_set_by_name_and_project(
|
created = await self._find_knowledge_set_by_name_and_project(
|
||||||
project_name, locked_project.id
|
project_name, project.id
|
||||||
)
|
)
|
||||||
|
|
||||||
if not created:
|
if not created:
|
||||||
fallback_name = self._build_fallback_set_name(
|
fallback_name = self._build_fallback_set_name(project_name, project.id)
|
||||||
project_name, locked_project.id
|
|
||||||
)
|
|
||||||
existing = await self._find_knowledge_set_by_name_and_project(
|
existing = await self._find_knowledge_set_by_name_and_project(
|
||||||
fallback_name, locked_project.id
|
fallback_name, project.id
|
||||||
)
|
)
|
||||||
if existing:
|
if existing:
|
||||||
created = existing
|
created = existing
|
||||||
@@ -139,20 +120,13 @@ class KnowledgeSyncService:
|
|||||||
created = await self._create_knowledge_set(fallback_name, metadata)
|
created = await self._create_knowledge_set(fallback_name, metadata)
|
||||||
if not created:
|
if not created:
|
||||||
created = await self._find_knowledge_set_by_name_and_project(
|
created = await self._find_knowledge_set_by_name_and_project(
|
||||||
fallback_name, locked_project.id
|
fallback_name, project.id
|
||||||
)
|
)
|
||||||
|
|
||||||
if not created:
|
if not created:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
await self._update_project_config(
|
return await self._update_knowledge_set_config(project, created)
|
||||||
locked_project,
|
|
||||||
{
|
|
||||||
self.CONFIG_KEY_SET_ID: created.get("id"),
|
|
||||||
self.CONFIG_KEY_SET_NAME: created.get("name"),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
return created.get("id")
|
|
||||||
|
|
||||||
async def _get_knowledge_set(self, set_id: str) -> Optional[Dict[str, Any]]:
|
async def _get_knowledge_set(self, set_id: str) -> Optional[Dict[str, Any]]:
|
||||||
try:
|
try:
|
||||||
@@ -165,11 +139,14 @@ class KnowledgeSyncService:
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
async def _list_knowledge_sets(
|
async def _list_knowledge_sets(
|
||||||
self, keyword: Optional[str]
|
self,
|
||||||
|
keyword: Optional[str],
|
||||||
|
page: Optional[int] = None,
|
||||||
|
size: Optional[int] = None,
|
||||||
) -> list[Dict[str, Any]]:
|
) -> list[Dict[str, Any]]:
|
||||||
params: Dict[str, Any] = {
|
params: Dict[str, Any] = {
|
||||||
"page": 1,
|
"page": page if page is not None else 1,
|
||||||
"size": self.KNOWLEDGE_SET_LIST_SIZE,
|
"size": size if size is not None else self.KNOWLEDGE_SET_LIST_SIZE,
|
||||||
}
|
}
|
||||||
if keyword:
|
if keyword:
|
||||||
params["keyword"] = keyword
|
params["keyword"] = keyword
|
||||||
@@ -191,12 +168,29 @@ class KnowledgeSyncService:
|
|||||||
return []
|
return []
|
||||||
return [item for item in content if isinstance(item, dict)]
|
return [item for item in content if isinstance(item, dict)]
|
||||||
|
|
||||||
|
async def _list_all_knowledge_sets(
    self, keyword: Optional[str] = None
) -> list[Dict[str, Any]]:
    """Collect knowledge sets from every page of the listing call.

    Pages are requested through ``_list_knowledge_sets`` starting at page 1,
    each with ``KNOWLEDGE_SET_LIST_SIZE`` entries, until an empty page, a
    short page, or a page identical to the previous one is returned.

    Args:
        keyword: Optional filter forwarded unchanged to
            ``_list_knowledge_sets``.

    Returns:
        The concatenation of all fetched pages, in fetch order.
    """
    page_size = self.KNOWLEDGE_SET_LIST_SIZE
    collected: list[Dict[str, Any]] = []
    previous_batch: Optional[list[Dict[str, Any]]] = None
    page = 1
    while True:
        batch = await self._list_knowledge_sets(
            keyword, page=page, size=page_size
        )
        # A page that repeats the previous one means the backend is not
        # honouring ``page``; stop instead of looping forever on the same
        # data and growing the result without bound.
        if not batch or batch == previous_batch:
            break
        collected.extend(batch)
        if len(batch) < page_size:
            # Short page: nothing left after this one.
            break
        previous_batch = batch
        page += 1
    return collected
|
||||||
|
|
||||||
async def _find_knowledge_set_by_name_and_project(
|
async def _find_knowledge_set_by_name_and_project(
|
||||||
self, name: str, project_id: str
|
self, name: str, project_id: str
|
||||||
) -> Optional[Dict[str, Any]]:
|
) -> Optional[Dict[str, Any]]:
|
||||||
if not name:
|
if not name:
|
||||||
return None
|
return None
|
||||||
items = await self._list_knowledge_sets(name)
|
items = await self._list_all_knowledge_sets(name)
|
||||||
if not items:
|
if not items:
|
||||||
return None
|
return None
|
||||||
for item in items:
|
for item in items:
|
||||||
@@ -278,7 +272,7 @@ class KnowledgeSyncService:
|
|||||||
|
|
||||||
async def _cleanup_knowledge_set_for_project(self, project_id: str) -> None:
|
async def _cleanup_knowledge_set_for_project(self, project_id: str) -> None:
|
||||||
"""清理项目关联的知识集及其所有知识条目"""
|
"""清理项目关联的知识集及其所有知识条目"""
|
||||||
items = await self._list_knowledge_sets(None)
|
items = await self._list_all_knowledge_sets()
|
||||||
for item in items:
|
for item in items:
|
||||||
if self._metadata_matches_project(item.get("metadata"), project_id):
|
if self._metadata_matches_project(item.get("metadata"), project_id):
|
||||||
set_id = item.get("id")
|
set_id = item.get("id")
|
||||||
@@ -303,7 +297,7 @@ class KnowledgeSyncService:
|
|||||||
self, dataset_id: str, file_id: str
|
self, dataset_id: str, file_id: str
|
||||||
) -> None:
|
) -> None:
|
||||||
"""清理文件的知识条目"""
|
"""清理文件的知识条目"""
|
||||||
items = await self._list_knowledge_sets(None)
|
items = await self._list_all_knowledge_sets()
|
||||||
for set_item in items:
|
for set_item in items:
|
||||||
set_id = set_item.get("id")
|
set_id = set_item.get("id")
|
||||||
if not set_id:
|
if not set_id:
|
||||||
@@ -427,6 +421,45 @@ class KnowledgeSyncService:
|
|||||||
short_id = project_id.replace("-", "")[:8]
|
short_id = project_id.replace("-", "")[:8]
|
||||||
return f"{base_name}-annotation-{short_id}"
|
return f"{base_name}-annotation-{short_id}"
|
||||||
|
|
||||||
|
async def _update_knowledge_set_config(
    self, project: LabelingProject, knowledge_set: Dict[str, Any]
) -> Optional[str]:
    """Persist the knowledge-set binding on the project under a row lock.

    The project row is re-read with ``SELECT ... FOR UPDATE`` and its
    configuration is checked again before writing, so that a binding written
    concurrently since the caller's unlocked read is kept rather than
    overwritten.

    Args:
        project: The project as seen by the caller (unlocked view). Its
            ``configuration`` is used as the snapshot to compare against.
        knowledge_set: The knowledge set to bind; its ``"id"`` and ``"name"``
            entries are stored in the configuration.

    Returns:
        The knowledge-set id now recorded for the project: either the id
        found in the locked row when it differs from the caller's snapshot,
        or ``knowledge_set["id"]`` after a successful write. ``None`` when
        the project row cannot be found.
    """
    # Capture the caller's view BEFORE the locked query: the query may hand
    # back the same ORM instance with refreshed attributes.
    # NOTE(review): whether the instance is shared/refreshed depends on the
    # session setup — confirm.
    snapshot = project.configuration if isinstance(project.configuration, dict) else {}
    snapshot_set_id = snapshot.get(self.CONFIG_KEY_SET_ID)

    result = await self.db.execute(
        select(LabelingProject)
        .where(LabelingProject.id == project.id)
        .with_for_update()
    )
    locked_project = result.scalar_one_or_none()
    if not locked_project:
        logger.warning("标注同步失败:无法锁定项目:project_id=%s", project.id)
        return None

    config = (
        locked_project.configuration
        if isinstance(locked_project.configuration, dict)
        else {}
    )
    current_set_id = config.get(self.CONFIG_KEY_SET_ID)

    # Only treat the stored id as "written by someone else" when it differs
    # from what the caller saw. If it is unchanged, it is the very id the
    # caller decided to replace (absent or failed validation), and returning
    # it would discard the rebuilt knowledge set.
    if current_set_id and current_set_id != snapshot_set_id:
        logger.info(
            "知识集配置已被其他进程更新:set_id=%s project_id=%s",
            current_set_id,
            locked_project.id,
        )
        return current_set_id

    # Assign a fresh dict instead of mutating the loaded one in place, so the
    # ORM sees a new value even when the column has no mutation tracking.
    locked_project.configuration = {
        **config,
        self.CONFIG_KEY_SET_ID: knowledge_set.get("id"),
        self.CONFIG_KEY_SET_NAME: knowledge_set.get("name"),
    }
    await self.db.commit()
    await self.db.refresh(locked_project)
    return knowledge_set.get("id")
|
||||||
|
|
||||||
async def _update_project_config(
|
async def _update_project_config(
|
||||||
self, project: LabelingProject, updates: Dict[str, Any]
|
self, project: LabelingProject, updates: Dict[str, Any]
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user