You've already forked DataMate
feat(annotation): 添加文本项目源文档过滤功能
- 在 EditorProjectInfo 中增加 datasetType 字段
- 移除前端硬编码的源文档扩展名列表
- 添加 isTextProject 判断逻辑
- 实现 prefetch 和 loadTasks 中的源文档排除参数
- 在后端接口中添加 excludeSourceDocuments 参数
- 实现源文档类型的数据库查询过滤逻辑
- 优化任务列表加载性能，避免不必要的源文档加载
This commit is contained in:
@@ -18,7 +18,7 @@ import hashlib
|
||||
import json
|
||||
import xml.etree.ElementTree as ET
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy import case, func, select
|
||||
from sqlalchemy import case, func, select, or_
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.config import settings
|
||||
@@ -69,6 +69,8 @@ SUPPORTED_EDITOR_DATASET_TYPES = (
|
||||
DATASET_TYPE_VIDEO,
|
||||
)
|
||||
SEGMENTATION_ENABLED_KEY = "segmentation_enabled"
|
||||
SOURCE_DOCUMENT_EXTENSIONS = (".pdf", ".doc", ".docx")
|
||||
SOURCE_DOCUMENT_TYPES = ("pdf", "doc", "docx")
|
||||
|
||||
|
||||
class AnnotationEditorService:
|
||||
@@ -346,6 +348,14 @@ class AnnotationEditorService:
|
||||
|
||||
return ET.tostring(root, encoding="unicode")
|
||||
|
||||
@classmethod
def _build_source_document_filter(cls):
    """Build a SQL predicate that matches source-document rows.

    A ``DatasetFiles`` row matches when its lower-cased ``file_type`` is one
    of ``SOURCE_DOCUMENT_TYPES`` or its lower-cased ``file_name`` ends with
    one of ``SOURCE_DOCUMENT_EXTENSIONS``.

    Both columns are coalesced to ``""`` before comparison so the predicate
    always evaluates to TRUE or FALSE, never SQL NULL. This matters because
    the predicate is used negated (``~filter``) to exclude source documents:
    under three-valued logic ``NOT NULL`` is still NULL, so a row with a
    NULL ``file_type`` and a non-matching name would otherwise be dropped
    from the result even though it is not a source document.
    NOTE(review): assumes ``file_type`` (and possibly ``file_name``) may be
    NULL -- confirm against the ``DatasetFiles`` schema; the coalesce is a
    no-op for non-nullable columns.

    Returns:
        A SQLAlchemy boolean expression OR-ing the type condition with one
        suffix ``LIKE`` condition per known extension.
    """
    file_type_lower = func.lower(func.coalesce(DatasetFiles.file_type, ""))
    file_name_lower = func.lower(func.coalesce(DatasetFiles.file_name, ""))
    type_condition = file_type_lower.in_(SOURCE_DOCUMENT_TYPES)
    # Extensions contain no LIKE wildcards ("%" / "_"), so a plain suffix
    # pattern is safe without escaping.
    name_conditions = [
        file_name_lower.like(f"%{ext}") for ext in SOURCE_DOCUMENT_EXTENSIONS
    ]
    return or_(type_condition, *name_conditions)
|
||||
|
||||
def _build_task_data(
|
||||
self,
|
||||
display_text: str,
|
||||
@@ -404,18 +414,35 @@ class AnnotationEditorService:
|
||||
return EditorProjectInfo(
|
||||
projectId=project.id,
|
||||
datasetId=project.dataset_id,
|
||||
datasetType=dataset_type or None,
|
||||
templateId=project.template_id,
|
||||
labelConfig=label_config,
|
||||
supported=supported,
|
||||
unsupportedReason=unsupported_reason,
|
||||
)
|
||||
|
||||
async def list_tasks(self, project_id: str, page: int = 0, size: int = 50) -> EditorTaskListResponse:
|
||||
async def list_tasks(
|
||||
self,
|
||||
project_id: str,
|
||||
page: int = 0,
|
||||
size: int = 50,
|
||||
exclude_source_documents: Optional[bool] = None,
|
||||
) -> EditorTaskListResponse:
|
||||
project = await self._get_project_or_404(project_id)
|
||||
dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
|
||||
should_exclude_source_documents = False
|
||||
if dataset_type == DATASET_TYPE_TEXT:
|
||||
should_exclude_source_documents = (
|
||||
exclude_source_documents if exclude_source_documents is not None else True
|
||||
)
|
||||
|
||||
base_conditions = [DatasetFiles.dataset_id == project.dataset_id]
|
||||
if should_exclude_source_documents:
|
||||
base_conditions.append(~self._build_source_document_filter())
|
||||
|
||||
count_result = await self.db.execute(
|
||||
select(func.count()).select_from(DatasetFiles).where(
|
||||
DatasetFiles.dataset_id == project.dataset_id
|
||||
*base_conditions
|
||||
)
|
||||
)
|
||||
total = int(count_result.scalar() or 0)
|
||||
@@ -431,7 +458,7 @@ class AnnotationEditorService:
|
||||
(AnnotationResult.file_id == DatasetFiles.id)
|
||||
& (AnnotationResult.project_id == project_id),
|
||||
)
|
||||
.where(DatasetFiles.dataset_id == project.dataset_id)
|
||||
.where(*base_conditions)
|
||||
.order_by(annotated_sort_key.asc(), DatasetFiles.created_at.desc())
|
||||
.offset(page * size)
|
||||
.limit(size)
|
||||
|
||||
Reference in New Issue
Block a user