fix: 修复知识库同步的并发控制、数据清理、文件事务和COCO导出问题

问题1 - 并发控制缺失:
- 在 _ensure_knowledge_set 方法中添加数据库行锁(with_for_update)
- 修改 _update_project_config 方法,使用行锁保护配置更新

问题3 - 数据清理机制缺失:
- 添加 _cleanup_knowledge_set_for_project 方法,项目删除时清理知识集
- 添加 _cleanup_knowledge_item_for_file 方法,文件删除时清理知识条目
- 在 delete_mapping 接口中调用清理方法

问题4 - 文件操作事务问题:
- 修改 uploadKnowledgeItems,添加事务失败后的文件清理逻辑
- 修改 deleteKnowledgeItem,删除记录前先删除关联文件
- 新增 deleteKnowledgeItemFile 辅助方法

问题5 - COCO导出格式问题:
- 添加 _get_image_dimensions 方法读取图片实际宽高
- 将百分比坐标转换为像素坐标
- 在 AnnotationExportItem 中添加 file_path 字段

涉及文件:
- knowledge_sync.py
- project.py
- KnowledgeItemApplicationService.java
- export.py
- export schema.py
This commit is contained in:
2026-02-05 03:55:01 +08:00
parent c03bdf1a24
commit 99bd83d312
5 changed files with 513 additions and 238 deletions

View File

@@ -43,7 +43,9 @@ class KnowledgeSyncService:
logger.warning("标注同步失败:无法获取知识集")
return
item = await self._get_item_by_source(set_id, project.dataset_id, str(file_record.id))
item = await self._get_item_by_source(
set_id, project.dataset_id, str(file_record.id)
)
if item and item.get("status") in {"PUBLISHED", "ARCHIVED", "DEPRECATED"}:
logger.info(
"知识条目为只读状态,跳过同步:item_id=%s status=%s",
@@ -71,26 +73,46 @@ class KnowledgeSyncService:
logger.warning("标注同步到知识管理失败:%s", exc)
async def _ensure_knowledge_set(self, project: LabelingProject) -> Optional[str]:
config = project.configuration if isinstance(project.configuration, dict) else {}
result = await self.db.execute(
select(LabelingProject)
.where(LabelingProject.id == project.id)
.with_for_update()
)
locked_project = result.scalar_one_or_none()
if not locked_project:
logger.warning("标注同步失败:无法锁定项目:project_id=%s", project.id)
return None
config = (
locked_project.configuration
if isinstance(locked_project.configuration, dict)
else {}
)
set_id = config.get(self.CONFIG_KEY_SET_ID)
if set_id:
exists = await self._get_knowledge_set(set_id)
if exists and self._metadata_matches_project(exists.get("metadata"), project.id):
if exists and self._metadata_matches_project(
exists.get("metadata"), locked_project.id
):
return set_id
logger.warning(
"知识集不存在或归属不匹配,准备重建:set_id=%s project_id=%s",
set_id,
project.id,
locked_project.id,
)
project_name = (project.name or "annotation-project").strip() or "annotation-project"
metadata = self._build_set_metadata(project)
project_name = (
locked_project.name or "annotation-project"
).strip() or "annotation-project"
metadata = self._build_set_metadata(locked_project)
existing = await self._find_knowledge_set_by_name_and_project(project_name, project.id)
existing = await self._find_knowledge_set_by_name_and_project(
project_name, locked_project.id
)
if existing:
await self._update_project_config(
project,
locked_project,
{
self.CONFIG_KEY_SET_ID: existing.get("id"),
self.CONFIG_KEY_SET_NAME: existing.get("name"),
@@ -100,23 +122,31 @@ class KnowledgeSyncService:
created = await self._create_knowledge_set(project_name, metadata)
if not created:
created = await self._find_knowledge_set_by_name_and_project(project_name, project.id)
created = await self._find_knowledge_set_by_name_and_project(
project_name, locked_project.id
)
if not created:
fallback_name = self._build_fallback_set_name(project_name, project.id)
existing = await self._find_knowledge_set_by_name_and_project(fallback_name, project.id)
fallback_name = self._build_fallback_set_name(
project_name, locked_project.id
)
existing = await self._find_knowledge_set_by_name_and_project(
fallback_name, locked_project.id
)
if existing:
created = existing
else:
created = await self._create_knowledge_set(fallback_name, metadata)
if not created:
created = await self._find_knowledge_set_by_name_and_project(fallback_name, project.id)
created = await self._find_knowledge_set_by_name_and_project(
fallback_name, locked_project.id
)
if not created:
return None
await self._update_project_config(
project,
locked_project,
{
self.CONFIG_KEY_SET_ID: created.get("id"),
self.CONFIG_KEY_SET_NAME: created.get("name"),
@@ -126,13 +156,17 @@ class KnowledgeSyncService:
async def _get_knowledge_set(self, set_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one knowledge set by id.

    Returns whatever ``_request`` yields for the knowledge-set endpoint, or
    ``None`` when the service answers with HTTP 404. Any other
    ``httpx.HTTPStatusError`` is re-raised unchanged.
    """
    set_path = f"/data-management/knowledge-sets/{set_id}"
    try:
        knowledge_set = await self._request("GET", set_path)
    except httpx.HTTPStatusError as exc:
        # Only "not found" is translated into a None result.
        if exc.response.status_code != 404:
            raise
        return None
    return knowledge_set
async def _list_knowledge_sets(self, keyword: Optional[str]) -> list[Dict[str, Any]]:
async def _list_knowledge_sets(
self, keyword: Optional[str]
) -> list[Dict[str, Any]]:
params: Dict[str, Any] = {
"page": 1,
"size": self.KNOWLEDGE_SET_LIST_SIZE,
@@ -140,7 +174,9 @@ class KnowledgeSyncService:
if keyword:
params["keyword"] = keyword
try:
data = await self._request("GET", "/data-management/knowledge-sets", params=params)
data = await self._request(
"GET", "/data-management/knowledge-sets", params=params
)
except httpx.HTTPStatusError as exc:
logger.warning(
"查询知识集失败:keyword=%s status=%s",
@@ -155,7 +191,9 @@ class KnowledgeSyncService:
return []
return [item for item in content if isinstance(item, dict)]
async def _find_knowledge_set_by_name_and_project(self, name: str, project_id: str) -> Optional[Dict[str, Any]]:
async def _find_knowledge_set_by_name_and_project(
self, name: str, project_id: str
) -> Optional[Dict[str, Any]]:
if not name:
return None
items = await self._list_knowledge_sets(name)
@@ -168,7 +206,9 @@ class KnowledgeSyncService:
return item
return None
async def _create_knowledge_set(self, name: str, metadata: str) -> Optional[Dict[str, Any]]:
async def _create_knowledge_set(
self, name: str, metadata: str
) -> Optional[Dict[str, Any]]:
payload = {
"name": name,
"description": "标注项目自动创建的知识集",
@@ -176,7 +216,9 @@ class KnowledgeSyncService:
"metadata": metadata,
}
try:
return await self._request("POST", "/data-management/knowledge-sets", json=payload)
return await self._request(
"POST", "/data-management/knowledge-sets", json=payload
)
except httpx.HTTPStatusError as exc:
logger.warning(
"创建知识集失败:name=%s status=%s detail=%s",
@@ -199,7 +241,9 @@ class KnowledgeSyncService:
"sourceFileId": file_id,
}
try:
data = await self._request("GET", f"/data-management/knowledge-sets/{set_id}/items", params=params)
data = await self._request(
"GET", f"/data-management/knowledge-sets/{set_id}/items", params=params
)
except httpx.HTTPStatusError as exc:
logger.warning(
"查询知识条目失败:set_id=%s status=%s",
@@ -216,9 +260,13 @@ class KnowledgeSyncService:
return content[0]
async def _create_item(self, set_id: str, payload: Dict[str, Any]) -> None:
    """POST ``payload`` as a new item of the knowledge set ``set_id``.

    The value returned by ``_request`` is discarded; anything it raises
    propagates to the caller.
    """
    items_path = f"/data-management/knowledge-sets/{set_id}/items"
    await self._request("POST", items_path, json=payload)
async def _update_item(self, set_id: str, item_id: str, payload: Dict[str, Any]) -> None:
async def _update_item(
self, set_id: str, item_id: str, payload: Dict[str, Any]
) -> None:
update_payload = dict(payload)
update_payload.pop("sourceDatasetId", None)
update_payload.pop("sourceFileId", None)
@@ -228,6 +276,62 @@ class KnowledgeSyncService:
json=update_payload,
)
async def _cleanup_knowledge_set_for_project(self, project_id: str) -> None:
    """Delete the knowledge sets whose metadata matches ``project_id``.

    Every set returned by ``_list_knowledge_sets(None)`` is checked with
    ``_metadata_matches_project``; each match that has an ``id`` gets a
    DELETE request. A failed deletion is logged as a warning and the loop
    moves on, so one failure does not stop the remaining deletions.

    NOTE(review): only the set itself is deleted here; whether the service
    also removes the set's items is not visible from this method — confirm.
    """
    for knowledge_set in await self._list_knowledge_sets(None):
        belongs_to_project = self._metadata_matches_project(
            knowledge_set.get("metadata"), project_id
        )
        if not belongs_to_project:
            continue
        set_id = knowledge_set.get("id")
        if not set_id:
            continue
        try:
            await self._request(
                "DELETE", f"/data-management/knowledge-sets/{set_id}"
            )
            logger.info(
                "已删除知识集:set_id=%s project_id=%s", set_id, project_id
            )
        except Exception as exc:
            # Best-effort cleanup: record the failure and keep going.
            logger.warning(
                "删除知识集失败:set_id=%s project_id=%s error=%s",
                set_id,
                project_id,
                exc,
            )
async def _cleanup_knowledge_item_for_file(
    self, dataset_id: str, file_id: str
) -> None:
    """Delete the knowledge item derived from a file, in every listed set.

    For each knowledge set returned by ``_list_knowledge_sets(None)`` that
    has an ``id``, the item is looked up with ``_get_item_by_source`` using
    ``dataset_id`` and ``file_id``. When an item with an ``id`` is found, a
    DELETE request is sent for it. A failed deletion is logged as a warning
    and the remaining sets are still processed.
    """
    knowledge_sets = await self._list_knowledge_sets(None)
    for knowledge_set in knowledge_sets:
        set_id = knowledge_set.get("id")
        if not set_id:
            continue
        item = await self._get_item_by_source(set_id, dataset_id, file_id)
        if not item:
            continue
        item_id = item.get("id")
        if not item_id:
            continue
        try:
            await self._request(
                "DELETE",
                f"/data-management/knowledge-sets/{set_id}/items/{item_id}",
            )
            logger.info(
                "已删除知识条目:item_id=%s set_id=%s dataset_id=%s file_id=%s",
                item_id,
                set_id,
                dataset_id,
                file_id,
            )
        except Exception as exc:
            # Best-effort cleanup: log and continue with the next set.
            logger.warning(
                "删除知识条目失败:item_id=%s set_id=%s dataset_id=%s file_id=%s error=%s",
                item_id,
                set_id,
                dataset_id,
                file_id,
                exc,
            )
async def _build_item_payload(
self,
project: LabelingProject,
@@ -323,12 +427,28 @@ class KnowledgeSyncService:
short_id = project_id.replace("-", "")[:8]
return f"{base_name}-annotation-{short_id}"
async def _update_project_config(
    self, project: LabelingProject, updates: Dict[str, Any]
) -> None:
    """Merge ``updates`` into the project's configuration and commit.

    The project row is re-selected with ``with_for_update()`` so the merge
    is applied to the row as currently stored. If the row cannot be loaded,
    a warning is logged and nothing is written.

    Args:
        project: The project whose ``id`` identifies the row to update.
        updates: Keys and values to set on the configuration; existing keys
            not present in ``updates`` are kept.
    """
    result = await self.db.execute(
        select(LabelingProject)
        .where(LabelingProject.id == project.id)
        .with_for_update()
    )
    locked_project = result.scalar_one_or_none()
    if not locked_project:
        logger.warning("更新项目配置失败:无法锁定项目:project_id=%s", project.id)
        return
    current_config = locked_project.configuration
    if not isinstance(current_config, dict):
        # A missing or non-dict configuration is treated as empty.
        current_config = {}
    # Assign a NEW dict instead of mutating the loaded one in place. With a
    # plain JSON column, reassigning the same (mutated) object compares
    # equal to its own "original" value, so the ORM may not emit an UPDATE.
    locked_project.configuration = {**current_config, **updates}
    # The commit also ends the transaction that holds the row lock.
    await self.db.commit()
    await self.db.refresh(locked_project)
async def _request(self, method: str, path: str, **kwargs) -> Any:
url = f"{self.base_url}{path}"