You've already forked DataMate
Compare commits
2 Commits
a73571bd73
...
07a901043a
| Author | SHA1 | Date | |
|---|---|---|---|
| 07a901043a | |||
| 32e3fc97c6 |
@@ -11,7 +11,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from app.core.config import settings
|
||||
from app.core.logging import get_logger
|
||||
from app.db.models import Dataset, DatasetFiles, LabelingProject
|
||||
from app.module.annotation.service.text_fetcher import fetch_text_content_via_download_api
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -77,15 +76,18 @@ class KnowledgeSyncService:
|
||||
|
||||
if set_id:
|
||||
exists = await self._get_knowledge_set(set_id)
|
||||
if exists:
|
||||
if exists and self._metadata_matches_project(exists.get("metadata"), project.id):
|
||||
return set_id
|
||||
logger.warning("知识集不存在,准备重建:set_id=%s", set_id)
|
||||
logger.warning(
|
||||
"知识集不存在或归属不匹配,准备重建:set_id=%s project_id=%s",
|
||||
set_id,
|
||||
project.id,
|
||||
)
|
||||
|
||||
dataset_name = project.name or "annotation-project"
|
||||
base_name = dataset_name.strip() or "annotation-project"
|
||||
project_name = (project.name or "annotation-project").strip() or "annotation-project"
|
||||
metadata = self._build_set_metadata(project)
|
||||
|
||||
existing = await self._find_knowledge_set_by_name(base_name)
|
||||
existing = await self._find_knowledge_set_by_name_and_project(project_name, project.id)
|
||||
if existing:
|
||||
await self._update_project_config(
|
||||
project,
|
||||
@@ -96,19 +98,19 @@ class KnowledgeSyncService:
|
||||
)
|
||||
return existing.get("id")
|
||||
|
||||
created = await self._create_knowledge_set(base_name, metadata)
|
||||
created = await self._create_knowledge_set(project_name, metadata)
|
||||
if not created:
|
||||
created = await self._find_knowledge_set_by_name(base_name)
|
||||
created = await self._find_knowledge_set_by_name_and_project(project_name, project.id)
|
||||
|
||||
if not created:
|
||||
fallback_name = self._build_fallback_set_name(base_name, project.id)
|
||||
existing = await self._find_knowledge_set_by_name(fallback_name)
|
||||
fallback_name = self._build_fallback_set_name(project_name, project.id)
|
||||
existing = await self._find_knowledge_set_by_name_and_project(fallback_name, project.id)
|
||||
if existing:
|
||||
created = existing
|
||||
else:
|
||||
created = await self._create_knowledge_set(fallback_name, metadata)
|
||||
if not created:
|
||||
created = await self._find_knowledge_set_by_name(fallback_name)
|
||||
created = await self._find_knowledge_set_by_name_and_project(fallback_name, project.id)
|
||||
|
||||
if not created:
|
||||
return None
|
||||
@@ -153,16 +155,18 @@ class KnowledgeSyncService:
|
||||
return []
|
||||
return [item for item in content if isinstance(item, dict)]
|
||||
|
||||
async def _find_knowledge_set_by_name(self, name: str) -> Optional[Dict[str, Any]]:
|
||||
async def _find_knowledge_set_by_name_and_project(self, name: str, project_id: str) -> Optional[Dict[str, Any]]:
|
||||
if not name:
|
||||
return None
|
||||
items = await self._list_knowledge_sets(name)
|
||||
if not items:
|
||||
return None
|
||||
exact_matches = [item for item in items if item.get("name") == name]
|
||||
if not exact_matches:
|
||||
return None
|
||||
return exact_matches[0]
|
||||
for item in items:
|
||||
if item.get("name") != name:
|
||||
continue
|
||||
if self._metadata_matches_project(item.get("metadata"), project_id):
|
||||
return item
|
||||
return None
|
||||
|
||||
async def _create_knowledge_set(self, name: str, metadata: str) -> Optional[Dict[str, Any]]:
|
||||
payload = {
|
||||
@@ -249,16 +253,6 @@ class KnowledgeSyncService:
|
||||
content_type = "MARKDOWN"
|
||||
|
||||
content = annotation_json
|
||||
if dataset_type == "TEXT":
|
||||
try:
|
||||
content = await fetch_text_content_via_download_api(
|
||||
project.dataset_id,
|
||||
str(file_record.id),
|
||||
)
|
||||
content = self._append_annotation_to_content(content, annotation_json, content_type)
|
||||
except Exception as exc:
|
||||
logger.warning("读取文本失败,改为仅存标注JSON:%s", exc)
|
||||
content = annotation_json
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"title": title,
|
||||
@@ -289,13 +283,6 @@ class KnowledgeSyncService:
|
||||
extension = file_type
|
||||
return extension.lower() in {"md", "markdown"}
|
||||
|
||||
def _append_annotation_to_content(self, content: str, annotation_json: str, content_type: str) -> str:
|
||||
if content_type == "MARKDOWN":
|
||||
return (
|
||||
f"{content}\n\n---\n\n## 标注结果\n\n```json\n"
|
||||
f"{annotation_json}\n```")
|
||||
return f"{content}\n\n---\n\n标注结果(JSON):\n{annotation_json}"
|
||||
|
||||
def _strip_extension(self, file_name: str) -> str:
|
||||
if not file_name:
|
||||
return ""
|
||||
@@ -359,6 +346,27 @@ class KnowledgeSyncService:
|
||||
except Exception:
|
||||
return json.dumps({"error": "failed to serialize"}, ensure_ascii=False)
|
||||
|
||||
def _metadata_matches_project(self, metadata: Any, project_id: str) -> bool:
|
||||
if not project_id:
|
||||
return False
|
||||
parsed = self._parse_metadata(metadata)
|
||||
if not parsed:
|
||||
return False
|
||||
return str(parsed.get("project_id") or "").strip() == project_id
|
||||
|
||||
def _parse_metadata(self, metadata: Any) -> Optional[Dict[str, Any]]:
|
||||
if metadata is None:
|
||||
return None
|
||||
if isinstance(metadata, dict):
|
||||
return metadata
|
||||
if isinstance(metadata, str):
|
||||
try:
|
||||
payload = json.loads(metadata)
|
||||
except Exception:
|
||||
return None
|
||||
return payload if isinstance(payload, dict) else None
|
||||
return None
|
||||
|
||||
def _safe_response_text(self, response: httpx.Response) -> str:
|
||||
try:
|
||||
return response.text
|
||||
|
||||
Reference in New Issue
Block a user