You've already forked DataMate
Compare commits
2 Commits
a73571bd73
...
07a901043a
| Author | SHA1 | Date | |
|---|---|---|---|
| | 07a901043a | | |
| | 32e3fc97c6 | | |
@@ -11,7 +11,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||||||
from app.core.config import settings
|
from app.core.config import settings
|
||||||
from app.core.logging import get_logger
|
from app.core.logging import get_logger
|
||||||
from app.db.models import Dataset, DatasetFiles, LabelingProject
|
from app.db.models import Dataset, DatasetFiles, LabelingProject
|
||||||
from app.module.annotation.service.text_fetcher import fetch_text_content_via_download_api
|
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
@@ -77,15 +76,18 @@ class KnowledgeSyncService:
|
|||||||
|
|
||||||
if set_id:
|
if set_id:
|
||||||
exists = await self._get_knowledge_set(set_id)
|
exists = await self._get_knowledge_set(set_id)
|
||||||
if exists:
|
if exists and self._metadata_matches_project(exists.get("metadata"), project.id):
|
||||||
return set_id
|
return set_id
|
||||||
logger.warning("知识集不存在,准备重建:set_id=%s", set_id)
|
logger.warning(
|
||||||
|
"知识集不存在或归属不匹配,准备重建:set_id=%s project_id=%s",
|
||||||
|
set_id,
|
||||||
|
project.id,
|
||||||
|
)
|
||||||
|
|
||||||
dataset_name = project.name or "annotation-project"
|
project_name = (project.name or "annotation-project").strip() or "annotation-project"
|
||||||
base_name = dataset_name.strip() or "annotation-project"
|
|
||||||
metadata = self._build_set_metadata(project)
|
metadata = self._build_set_metadata(project)
|
||||||
|
|
||||||
existing = await self._find_knowledge_set_by_name(base_name)
|
existing = await self._find_knowledge_set_by_name_and_project(project_name, project.id)
|
||||||
if existing:
|
if existing:
|
||||||
await self._update_project_config(
|
await self._update_project_config(
|
||||||
project,
|
project,
|
||||||
@@ -96,19 +98,19 @@ class KnowledgeSyncService:
|
|||||||
)
|
)
|
||||||
return existing.get("id")
|
return existing.get("id")
|
||||||
|
|
||||||
created = await self._create_knowledge_set(base_name, metadata)
|
created = await self._create_knowledge_set(project_name, metadata)
|
||||||
if not created:
|
if not created:
|
||||||
created = await self._find_knowledge_set_by_name(base_name)
|
created = await self._find_knowledge_set_by_name_and_project(project_name, project.id)
|
||||||
|
|
||||||
if not created:
|
if not created:
|
||||||
fallback_name = self._build_fallback_set_name(base_name, project.id)
|
fallback_name = self._build_fallback_set_name(project_name, project.id)
|
||||||
existing = await self._find_knowledge_set_by_name(fallback_name)
|
existing = await self._find_knowledge_set_by_name_and_project(fallback_name, project.id)
|
||||||
if existing:
|
if existing:
|
||||||
created = existing
|
created = existing
|
||||||
else:
|
else:
|
||||||
created = await self._create_knowledge_set(fallback_name, metadata)
|
created = await self._create_knowledge_set(fallback_name, metadata)
|
||||||
if not created:
|
if not created:
|
||||||
created = await self._find_knowledge_set_by_name(fallback_name)
|
created = await self._find_knowledge_set_by_name_and_project(fallback_name, project.id)
|
||||||
|
|
||||||
if not created:
|
if not created:
|
||||||
return None
|
return None
|
||||||
@@ -153,16 +155,18 @@ class KnowledgeSyncService:
|
|||||||
return []
|
return []
|
||||||
return [item for item in content if isinstance(item, dict)]
|
return [item for item in content if isinstance(item, dict)]
|
||||||
|
|
||||||
async def _find_knowledge_set_by_name(self, name: str) -> Optional[Dict[str, Any]]:
|
async def _find_knowledge_set_by_name_and_project(self, name: str, project_id: str) -> Optional[Dict[str, Any]]:
|
||||||
if not name:
|
if not name:
|
||||||
return None
|
return None
|
||||||
items = await self._list_knowledge_sets(name)
|
items = await self._list_knowledge_sets(name)
|
||||||
if not items:
|
if not items:
|
||||||
return None
|
return None
|
||||||
exact_matches = [item for item in items if item.get("name") == name]
|
for item in items:
|
||||||
if not exact_matches:
|
if item.get("name") != name:
|
||||||
return None
|
continue
|
||||||
return exact_matches[0]
|
if self._metadata_matches_project(item.get("metadata"), project_id):
|
||||||
|
return item
|
||||||
|
return None
|
||||||
|
|
||||||
async def _create_knowledge_set(self, name: str, metadata: str) -> Optional[Dict[str, Any]]:
|
async def _create_knowledge_set(self, name: str, metadata: str) -> Optional[Dict[str, Any]]:
|
||||||
payload = {
|
payload = {
|
||||||
@@ -249,16 +253,6 @@ class KnowledgeSyncService:
|
|||||||
content_type = "MARKDOWN"
|
content_type = "MARKDOWN"
|
||||||
|
|
||||||
content = annotation_json
|
content = annotation_json
|
||||||
if dataset_type == "TEXT":
|
|
||||||
try:
|
|
||||||
content = await fetch_text_content_via_download_api(
|
|
||||||
project.dataset_id,
|
|
||||||
str(file_record.id),
|
|
||||||
)
|
|
||||||
content = self._append_annotation_to_content(content, annotation_json, content_type)
|
|
||||||
except Exception as exc:
|
|
||||||
logger.warning("读取文本失败,改为仅存标注JSON:%s", exc)
|
|
||||||
content = annotation_json
|
|
||||||
|
|
||||||
payload: Dict[str, Any] = {
|
payload: Dict[str, Any] = {
|
||||||
"title": title,
|
"title": title,
|
||||||
@@ -289,13 +283,6 @@ class KnowledgeSyncService:
|
|||||||
extension = file_type
|
extension = file_type
|
||||||
return extension.lower() in {"md", "markdown"}
|
return extension.lower() in {"md", "markdown"}
|
||||||
|
|
||||||
def _append_annotation_to_content(self, content: str, annotation_json: str, content_type: str) -> str:
|
|
||||||
if content_type == "MARKDOWN":
|
|
||||||
return (
|
|
||||||
f"{content}\n\n---\n\n## 标注结果\n\n```json\n"
|
|
||||||
f"{annotation_json}\n```")
|
|
||||||
return f"{content}\n\n---\n\n标注结果(JSON):\n{annotation_json}"
|
|
||||||
|
|
||||||
def _strip_extension(self, file_name: str) -> str:
|
def _strip_extension(self, file_name: str) -> str:
|
||||||
if not file_name:
|
if not file_name:
|
||||||
return ""
|
return ""
|
||||||
@@ -359,6 +346,27 @@ class KnowledgeSyncService:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return json.dumps({"error": "failed to serialize"}, ensure_ascii=False)
|
return json.dumps({"error": "failed to serialize"}, ensure_ascii=False)
|
||||||
|
|
||||||
|
def _metadata_matches_project(self, metadata: Any, project_id: str) -> bool:
|
||||||
|
if not project_id:
|
||||||
|
return False
|
||||||
|
parsed = self._parse_metadata(metadata)
|
||||||
|
if not parsed:
|
||||||
|
return False
|
||||||
|
return str(parsed.get("project_id") or "").strip() == project_id
|
||||||
|
|
||||||
|
def _parse_metadata(self, metadata: Any) -> Optional[Dict[str, Any]]:
|
||||||
|
if metadata is None:
|
||||||
|
return None
|
||||||
|
if isinstance(metadata, dict):
|
||||||
|
return metadata
|
||||||
|
if isinstance(metadata, str):
|
||||||
|
try:
|
||||||
|
payload = json.loads(metadata)
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
return payload if isinstance(payload, dict) else None
|
||||||
|
return None
|
||||||
|
|
||||||
def _safe_response_text(self, response: httpx.Response) -> str:
|
def _safe_response_text(self, response: httpx.Response) -> str:
|
||||||
try:
|
try:
|
||||||
return response.text
|
return response.text
|
||||||
|
|||||||
Reference in New Issue
Block a user