feat(annotation): 添加文本数据集段落切片功能

- 在前端组件中新增 segmentationEnabled 字段控制切片开关
- 为文本数据集添加段落切片配置选项,默认启用切片功能
- 在后端接口中新增 segmentation_enabled 参数传递给标注项目
- 实现切片逻辑控制,支持文本数据的自动段落分割
- 添加数据集类型判断,仅文本数据集支持切片配置
- 更新标注任务创建和编辑表单中的切片相关字段处理
This commit is contained in:
2026-01-26 12:05:21 +08:00
parent fa160164d2
commit 371df12a96
7 changed files with 218 additions and 32 deletions

View File

@@ -824,7 +824,7 @@ export default function LabelStudioTextEditor() {
/> />
</div> </div>
{segmented && ( {segmented && (
<div className="border-t border-gray-200 bg-white flex flex-col min-h-0"> <div className="flex-1 border-t border-gray-200 bg-white flex flex-col min-h-0">
<div className="px-3 py-2 border-b border-gray-200 bg-gray-50 font-medium text-sm flex items-center justify-between"> <div className="px-3 py-2 border-b border-gray-200 bg-gray-50 font-medium text-sm flex items-center justify-between">
<span>/</span> <span>/</span>
<Tag color="blue" style={{ margin: 0 }}> <Tag color="blue" style={{ margin: 0 }}>

View File

@@ -1,12 +1,13 @@
import { useEffect, useState } from "react"; import { useEffect, useMemo, useState } from "react";
import { Button, Input, Select, Form, message, Radio } from "antd"; import { Button, Input, Select, Form, message, Radio } from "antd";
import type { RadioChangeEvent } from "antd";
import TextArea from "antd/es/input/TextArea"; import TextArea from "antd/es/input/TextArea";
import { DatabaseOutlined } from "@ant-design/icons"; import { DatabaseOutlined } from "@ant-design/icons";
import { Link, useNavigate } from "react-router"; import { Link, useNavigate } from "react-router";
import { ArrowLeft } from "lucide-react"; import { ArrowLeft } from "lucide-react";
import { queryDatasetsUsingGet } from "../../DataManagement/dataset.api"; import { queryDatasetsUsingGet } from "../../DataManagement/dataset.api";
import { mapDataset } from "@/pages/DataManagement/dataset.const"; import { mapDataset } from "@/pages/DataManagement/dataset.const";
import type { Dataset } from "@/pages/DataManagement/dataset.model"; import { DatasetType, type Dataset } from "@/pages/DataManagement/dataset.model";
import { import {
createAnnotationTaskUsingPost, createAnnotationTaskUsingPost,
queryAnnotationTemplatesUsingGet, queryAnnotationTemplatesUsingGet,
@@ -14,20 +15,33 @@ import {
import type { AnnotationTemplate } from "../annotation.model"; import type { AnnotationTemplate } from "../annotation.model";
import TemplateConfigurationTreeEditor from "../components/TemplateConfigurationTreeEditor"; import TemplateConfigurationTreeEditor from "../components/TemplateConfigurationTreeEditor";
/** Value the segmentation switch falls back to when the form holds no explicit choice. */
const DEFAULT_SEGMENTATION_ENABLED = true;

/** Choices offered by the segmentation radio group: enable or disable paragraph slicing. */
const SEGMENTATION_OPTIONS: { label: string; value: boolean }[] = [
  { label: "需要切片段", value: true },
  { label: "不需要切片段", value: false },
];
export default function AnnotationTaskCreate() { export default function AnnotationTaskCreate() {
const navigate = useNavigate(); const navigate = useNavigate();
const [form] = Form.useForm(); const [form] = Form.useForm();
const [datasets, setDatasets] = useState<Dataset[]>([]); const [datasets, setDatasets] = useState<Dataset[]>([]);
const [templates, setTemplates] = useState<AnnotationTemplate[]>([]); const [templates, setTemplates] = useState<AnnotationTemplate[]>([]);
const [selectedDatasetId, setSelectedDatasetId] = useState<string | null>(null);
const [labelConfig, setLabelConfig] = useState(""); const [labelConfig, setLabelConfig] = useState("");
const [configMode, setConfigMode] = useState<"template" | "custom">("template"); const [configMode, setConfigMode] = useState<"template" | "custom">("template");
const [submitting, setSubmitting] = useState(false); const [submitting, setSubmitting] = useState(false);
const selectedDataset = useMemo(
() => datasets.find((dataset) => dataset.id === selectedDatasetId),
[datasets, selectedDatasetId]
);
const isTextDataset = selectedDataset?.datasetType === DatasetType.TEXT;
const fetchDatasets = async () => { const fetchDatasets = async () => {
try { try {
const { data } = await queryDatasetsUsingGet({ page: 0, pageSize: 1000 }); const { data } = await queryDatasetsUsingGet({ page: 0, pageSize: 1000 });
const list = data?.content || []; const list = data?.content || [];
setDatasets(list.map((item: any) => mapDataset(item)) || []); setDatasets(list.map((item) => mapDataset(item)) || []);
} catch (error) { } catch (error) {
console.error("加载数据集失败:", error); console.error("加载数据集失败:", error);
message.error("加载数据集失败"); message.error("加载数据集失败");
@@ -62,7 +76,7 @@ export default function AnnotationTaskCreate() {
setLabelConfig(selectedTemplate?.labelConfig || ""); setLabelConfig(selectedTemplate?.labelConfig || "");
}; };
const handleConfigModeChange = (e: any) => { const handleConfigModeChange = (e: RadioChangeEvent) => {
const mode = e.target.value; const mode = e.target.value;
setConfigMode(mode); setConfigMode(mode);
if (mode === "custom") { if (mode === "custom") {
@@ -79,20 +93,26 @@ export default function AnnotationTaskCreate() {
} }
setSubmitting(true); setSubmitting(true);
await createAnnotationTaskUsingPost({ const requestData: Record<string, unknown> = {
name: values.name, name: values.name,
description: values.description, description: values.description,
datasetId: values.datasetId, datasetId: values.datasetId,
templateId: configMode === "template" ? values.templateId : undefined, templateId: configMode === "template" ? values.templateId : undefined,
labelConfig: labelConfig.trim(), labelConfig: labelConfig.trim(),
}); };
if (isTextDataset) {
requestData.segmentationEnabled =
values.segmentationEnabled ?? DEFAULT_SEGMENTATION_ENABLED;
}
await createAnnotationTaskUsingPost(requestData);
message.success("标注任务创建成功"); message.success("标注任务创建成功");
navigate("/data/annotation"); navigate("/data/annotation");
} catch (error: any) { } catch (error: unknown) {
if (error?.errorFields) { const err = error as { errorFields?: unknown; message?: string; data?: { message?: string } };
if (err?.errorFields) {
message.error("请完善必填信息"); message.error("请完善必填信息");
} else { } else {
const msg = error?.message || error?.data?.message || "创建失败,请稍后重试"; const msg = err?.message || err?.data?.message || "创建失败,请稍后重试";
message.error(msg); message.error(msg);
console.error(error); console.error(error);
} }
@@ -149,6 +169,40 @@ export default function AnnotationTaskCreate() {
), ),
value: dataset.id, value: dataset.id,
}))} }))}
onChange={(value) => {
  // Keep the segmentation switch consistent with the type of the newly selected dataset.
  setSelectedDatasetId(value);
  const dataset = datasets.find((item) => item.id === value);
  if (!dataset) {
    // Selection cleared or unknown id: leave the current switch value alone.
    return;
  }
  if (dataset.datasetType !== DatasetType.TEXT) {
    // Only text datasets can be segmented, so force the switch off.
    form.setFieldsValue({ segmentationEnabled: false });
    return;
  }
  // The field always has a value (initialValue, or the forced `false` above), so an
  // `=== undefined` check can never fire. Use the touched flag instead: programmatic
  // setFieldsValue calls do not mark the field as touched, so an untouched field means
  // the user has not chosen yet and the default must be restored for text datasets.
  if (!form.isFieldTouched("segmentationEnabled")) {
    form.setFieldsValue({
      segmentationEnabled: DEFAULT_SEGMENTATION_ENABLED,
    });
  }
}}
/>
</Form.Item>
<Form.Item
label="段落切片"
name="segmentationEnabled"
initialValue={DEFAULT_SEGMENTATION_ENABLED}
extra={
!selectedDatasetId
? "请选择数据集后配置"
: isTextDataset
? "仅文本数据集可配置该项"
: "非文本数据集不支持切片段"
}
>
<Radio.Group
options={SEGMENTATION_OPTIONS}
optionType="button"
buttonStyle="solid"
disabled={!isTextDataset}
/> />
</Form.Item> </Form.Item>

View File

@@ -1,8 +1,10 @@
import { queryDatasetsUsingGet, queryDatasetFilesUsingGet } from "@/pages/DataManagement/dataset.api"; import { queryDatasetsUsingGet, queryDatasetFilesUsingGet } from "@/pages/DataManagement/dataset.api";
import { mapDataset } from "@/pages/DataManagement/dataset.const"; import { mapDataset } from "@/pages/DataManagement/dataset.const";
import { App, Button, Form, Input, Modal, Select, Radio, Table } from "antd"; import { App, Button, Form, Input, Modal, Select, Radio, Table } from "antd";
import type { RadioChangeEvent } from "antd";
import TextArea from "antd/es/input/TextArea"; import TextArea from "antd/es/input/TextArea";
import { useEffect, useState } from "react"; import { useEffect, useMemo, useState } from "react";
import type { ReactNode } from "react";
import { Eye } from "lucide-react"; import { Eye } from "lucide-react";
import { import {
createAnnotationTaskUsingPost, createAnnotationTaskUsingPost,
@@ -10,7 +12,7 @@ import {
updateAnnotationTaskByIdUsingPut, updateAnnotationTaskByIdUsingPut,
queryAnnotationTemplatesUsingGet, queryAnnotationTemplatesUsingGet,
} from "../../annotation.api"; } from "../../annotation.api";
import { type Dataset } from "@/pages/DataManagement/dataset.model"; import { DatasetType, type Dataset } from "@/pages/DataManagement/dataset.model";
import type { AnnotationTemplate, AnnotationTask } from "../../annotation.model"; import type { AnnotationTemplate, AnnotationTask } from "../../annotation.model";
import LabelStudioEmbed from "@/components/business/LabelStudioEmbed"; import LabelStudioEmbed from "@/components/business/LabelStudioEmbed";
import TemplateConfigurationTreeEditor from "../../components/TemplateConfigurationTreeEditor"; import TemplateConfigurationTreeEditor from "../../components/TemplateConfigurationTreeEditor";
@@ -24,6 +26,38 @@ interface AnnotationTaskDialogProps {
editTask?: AnnotationTask | null; editTask?: AnnotationTask | null;
} }
/** Dataset entry kept in the `datasets` state; the optional `icon` is rendered next to the dataset name in the select option label. */
type DatasetOption = Dataset & { icon?: ReactNode };
/** Row shape of the dataset preview list and the argument accepted by the file-content preview handler. */
type DatasetPreviewFile = {
id: string;
fileName?: string;
fileSize?: number;
};
/** Fields of an annotation task detail that the edit-mode loader reads to populate the form. */
type AnnotationTaskDetail = {
name?: string;
description?: string;
datasetId?: string;
// Task-level label configuration; the loader prefers it over `template.labelConfig`.
labelConfig?: string;
template?: { labelConfig?: string };
// When this is not a boolean the loader falls back to DEFAULT_SEGMENTATION_ENABLED.
segmentationEnabled?: boolean;
};
/** Generic response envelope; the edit-mode loader treats `code === 200` with a non-empty `data` as success. */
type ApiResponse<T> = {
code?: number;
message?: string;
data?: T;
};
/**
 * Type guard for plain object-like values.
 *
 * @param value - Any value of unknown shape.
 * @returns `true` when `value` is a non-null object that is not an array.
 */
const isRecord = (value: unknown): value is Record<string, unknown> => {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
};
/** Value the segmentation switch falls back to when the form or the loaded task holds no explicit choice. */
const DEFAULT_SEGMENTATION_ENABLED = true;

/** Choices offered by the segmentation radio group: enable or disable paragraph slicing. */
const SEGMENTATION_OPTIONS: { label: string; value: boolean }[] = [
  { label: "需要切片段", value: true },
  { label: "不需要切片段", value: false },
];
export default function CreateAnnotationTask({ export default function CreateAnnotationTask({
open, open,
onClose, onClose,
@@ -33,19 +67,19 @@ export default function CreateAnnotationTask({
const isEditMode = !!editTask; const isEditMode = !!editTask;
const { message } = App.useApp(); const { message } = App.useApp();
const [manualForm] = Form.useForm(); const [manualForm] = Form.useForm();
const [datasets, setDatasets] = useState<Dataset[]>([]); const [datasets, setDatasets] = useState<DatasetOption[]>([]);
const [templates, setTemplates] = useState<AnnotationTemplate[]>([]); const [templates, setTemplates] = useState<AnnotationTemplate[]>([]);
const [submitting, setSubmitting] = useState(false); const [submitting, setSubmitting] = useState(false);
const [nameManuallyEdited, setNameManuallyEdited] = useState(false); const [nameManuallyEdited, setNameManuallyEdited] = useState(false);
const [labelConfig, setLabelConfig] = useState(""); const [labelConfig, setLabelConfig] = useState("");
const [showPreview, setShowPreview] = useState(false); const [showPreview, setShowPreview] = useState(false);
const [previewTaskData, setPreviewTaskData] = useState<Record<string, any>>({}); const [previewTaskData, setPreviewTaskData] = useState<Record<string, unknown>>({});
const [configMode, setConfigMode] = useState<"template" | "custom">("template"); const [configMode, setConfigMode] = useState<"template" | "custom">("template");
// 数据集预览相关状态 // 数据集预览相关状态
const [datasetPreviewVisible, setDatasetPreviewVisible] = useState(false); const [datasetPreviewVisible, setDatasetPreviewVisible] = useState(false);
const [datasetPreviewData, setDatasetPreviewData] = useState<any[]>([]); const [datasetPreviewData, setDatasetPreviewData] = useState<DatasetPreviewFile[]>([]);
const [datasetPreviewLoading, setDatasetPreviewLoading] = useState(false); const [datasetPreviewLoading, setDatasetPreviewLoading] = useState(false);
const [selectedDatasetId, setSelectedDatasetId] = useState<string | null>(null); const [selectedDatasetId, setSelectedDatasetId] = useState<string | null>(null);
@@ -61,6 +95,12 @@ export default function CreateAnnotationTask({
const [taskDetailLoading, setTaskDetailLoading] = useState(false); const [taskDetailLoading, setTaskDetailLoading] = useState(false);
const { config: tagConfig } = useTagConfig(false); const { config: tagConfig } = useTagConfig(false);
const selectedDataset = useMemo(
() => datasets.find((dataset) => dataset.id === selectedDatasetId),
[datasets, selectedDatasetId]
);
const isTextDataset = selectedDataset?.datasetType === DatasetType.TEXT;
useEffect(() => { useEffect(() => {
if (!open) return; if (!open) return;
const fetchData = async () => { const fetchData = async () => {
@@ -107,7 +147,7 @@ export default function CreateAnnotationTask({
// 编辑模式:加载任务详情 // 编辑模式:加载任务详情
setTaskDetailLoading(true); setTaskDetailLoading(true);
getAnnotationTaskByIdUsingGet(editTask.id) getAnnotationTaskByIdUsingGet(editTask.id)
.then((res: any) => { .then((res: ApiResponse<AnnotationTaskDetail>) => {
if (res.code === 200 && res.data) { if (res.code === 200 && res.data) {
const taskDetail = res.data; const taskDetail = res.data;
// 填充基本信息 // 填充基本信息
@@ -115,8 +155,13 @@ export default function CreateAnnotationTask({
name: taskDetail.name, name: taskDetail.name,
description: taskDetail.description, description: taskDetail.description,
datasetId: taskDetail.datasetId, datasetId: taskDetail.datasetId,
segmentationEnabled: typeof taskDetail.segmentationEnabled === "boolean"
? taskDetail.segmentationEnabled
: DEFAULT_SEGMENTATION_ENABLED,
}); });
setSelectedDatasetId(taskDetail.datasetId); if (taskDetail.datasetId) {
setSelectedDatasetId(taskDetail.datasetId);
}
// 获取实际的 labelConfig(优先使用任务自身的配置,回退到模板配置) // 获取实际的 labelConfig(优先使用任务自身的配置,回退到模板配置)
const configXml = taskDetail.labelConfig || taskDetail.template?.labelConfig; const configXml = taskDetail.labelConfig || taskDetail.template?.labelConfig;
@@ -140,6 +185,9 @@ export default function CreateAnnotationTask({
// 创建模式:重置为默认状态 // 创建模式:重置为默认状态
setConfigMode("template"); setConfigMode("template");
setSelectedDatasetId(null); setSelectedDatasetId(null);
manualForm.setFieldsValue({
segmentationEnabled: DEFAULT_SEGMENTATION_ENABLED,
});
} }
} }
}, [open, manualForm, isEditMode, editTask, message]); }, [open, manualForm, isEditMode, editTask, message]);
@@ -154,7 +202,7 @@ export default function CreateAnnotationTask({
try { try {
const res = await queryDatasetFilesUsingGet(selectedDatasetId, { page: 0, size: 10 }); const res = await queryDatasetFilesUsingGet(selectedDatasetId, { page: 0, size: 10 });
if (res.code === '0' && res.data) { if (res.code === '0' && res.data) {
setDatasetPreviewData(res.data.content || []); setDatasetPreviewData((res.data.content || []) as DatasetPreviewFile[]);
setDatasetPreviewVisible(true); setDatasetPreviewVisible(true);
} else { } else {
message.error("获取数据集预览失败"); message.error("获取数据集预览失败");
@@ -168,7 +216,7 @@ export default function CreateAnnotationTask({
}; };
// 预览文件内容 // 预览文件内容
const handlePreviewFileContent = async (file: any) => { const handlePreviewFileContent = async (file: DatasetPreviewFile) => {
const fileName = file.fileName?.toLowerCase() || ''; const fileName = file.fileName?.toLowerCase() || '';
// 文件类型扩展名映射 // 文件类型扩展名映射
@@ -318,7 +366,7 @@ export default function CreateAnnotationTask({
}; };
const generatePreviewTaskDataFromLabelConfig = (xml: string) => { const generatePreviewTaskDataFromLabelConfig = (xml: string) => {
const exampleDataByType: Record<string, any> = { const exampleDataByType: Record<string, unknown> = {
Image: "https://labelstud.io/images/opa-header.png", Image: "https://labelstud.io/images/opa-header.png",
Audio: "https://labelstud.io/files/sample.wav", Audio: "https://labelstud.io/files/sample.wav",
AudioPlus: "https://labelstud.io/files/sample.wav", AudioPlus: "https://labelstud.io/files/sample.wav",
@@ -350,7 +398,7 @@ export default function CreateAnnotationTask({
}; };
} }
const data: Record<string, any> = {}; const data: Record<string, unknown> = {};
objects.forEach((obj) => { objects.forEach((obj) => {
const name = obj.name || ""; const name = obj.name || "";
const value = obj.value || ""; const value = obj.value || "";
@@ -380,14 +428,16 @@ export default function CreateAnnotationTask({
}; };
// 当选择模板时,加载 XML 配置到树编辑器(仅快速填充) // 当选择模板时,加载 XML 配置到树编辑器(仅快速填充)
const handleTemplateSelect = (value: string, option: any) => { const handleTemplateSelect = (value: string, option: unknown) => {
if (!value) { if (!value) {
setLabelConfig(""); setLabelConfig("");
return; return;
} }
const selectedTemplate = templates.find((template) => template.id === value); const selectedTemplate = templates.find((template) => template.id === value);
const configXml = selectedTemplate?.labelConfig || option?.config || ""; const configXml = selectedTemplate?.labelConfig
|| (isRecord(option) && typeof option.config === "string" ? option.config : "")
|| "";
setLabelConfig(configXml); setLabelConfig(configXml);
}; };
@@ -437,6 +487,10 @@ export default function CreateAnnotationTask({
templateId: configMode === "template" ? values.templateId : undefined, templateId: configMode === "template" ? values.templateId : undefined,
labelConfig: labelConfig.trim(), labelConfig: labelConfig.trim(),
}; };
if (!isEditMode && isTextDataset) {
requestData.segmentationEnabled =
values.segmentationEnabled ?? DEFAULT_SEGMENTATION_ENABLED;
}
if (isEditMode && editTask) { if (isEditMode && editTask) {
// 编辑模式:调用更新接口 // 编辑模式:调用更新接口
@@ -449,16 +503,17 @@ export default function CreateAnnotationTask({
} }
onClose(); onClose();
onRefresh(); onRefresh();
} catch (err: any) { } catch (err: unknown) {
console.error(isEditMode ? "Update annotation task failed" : "Create annotation task failed", err); console.error(isEditMode ? "Update annotation task failed" : "Create annotation task failed", err);
const msg = err?.message || err?.data?.message || (isEditMode ? "更新失败,请稍后重试" : "创建失败,请稍后重试"); const error = err as { message?: string; data?: { message?: string } };
const msg = error?.message || error?.data?.message || (isEditMode ? "更新失败,请稍后重试" : "创建失败,请稍后重试");
message.error(msg); message.error(msg);
} finally { } finally {
setSubmitting(false); setSubmitting(false);
} }
}; };
const handleConfigModeChange = (e: any) => { const handleConfigModeChange = (e: RadioChangeEvent) => {
const mode = e.target.value; const mode = e.target.value;
setConfigMode(mode); setConfigMode(mode);
if (mode === "custom") { if (mode === "custom") {
@@ -521,7 +576,7 @@ export default function CreateAnnotationTask({
label: ( label: (
<div className="flex items-center justify-between gap-3 py-2"> <div className="flex items-center justify-between gap-3 py-2">
<div className="flex items-center font-sm text-gray-900"> <div className="flex items-center font-sm text-gray-900">
<span className="mr-2">{(dataset as any).icon}</span> <span className="mr-2">{dataset.icon}</span>
<span>{dataset.name}</span> <span>{dataset.name}</span>
</div> </div>
<div className="text-xs text-gray-500">{dataset.size}</div> <div className="text-xs text-gray-500">{dataset.size}</div>
@@ -532,6 +587,17 @@ export default function CreateAnnotationTask({
})} })}
onChange={(value) => { onChange={(value) => {
setSelectedDatasetId(value); setSelectedDatasetId(value);
const dataset = datasets.find((item) => item.id === value);
if (dataset?.datasetType === DatasetType.TEXT) {
const currentValue = manualForm.getFieldValue("segmentationEnabled");
if (currentValue === undefined) {
manualForm.setFieldsValue({
segmentationEnabled: DEFAULT_SEGMENTATION_ENABLED,
});
}
} else if (dataset) {
manualForm.setFieldsValue({ segmentationEnabled: false });
}
// 如果用户未手动修改名称,则用数据集名称作为默认任务名 // 如果用户未手动修改名称,则用数据集名称作为默认任务名
if (!nameManuallyEdited) { if (!nameManuallyEdited) {
const ds = datasets.find((d) => d.id === value); const ds = datasets.find((d) => d.id === value);
@@ -578,6 +644,28 @@ export default function CreateAnnotationTask({
<TextArea placeholder="(可选)详细描述标注任务的要求和目标" rows={2} /> <TextArea placeholder="(可选)详细描述标注任务的要求和目标" rows={2} />
</Form.Item> </Form.Item>
<Form.Item
label="段落切片"
name="segmentationEnabled"
initialValue={DEFAULT_SEGMENTATION_ENABLED}
extra={
isEditMode
? "编辑模式暂不支持修改"
: !selectedDatasetId
? "请选择数据集后配置"
: isTextDataset
? "仅文本数据集可配置该项"
: "非文本数据集不支持切片段"
}
>
<Radio.Group
options={SEGMENTATION_OPTIONS}
optionType="button"
buttonStyle="solid"
disabled={!isTextDataset || isEditMode}
/>
</Form.Item>
{/* 标注模板选择 */} {/* 标注模板选择 */}
<div className="flex items-center justify-between mb-2"> <div className="flex items-center justify-between mb-2">
<span className="text-sm font-medium text-gray-700 after:content-['*'] after:text-red-500 after:ml-1"></span> <span className="text-sm font-medium text-gray-700 after:content-['*'] after:text-red-500 after:ml-1"></span>
@@ -721,7 +809,7 @@ export default function CreateAnnotationTask({
dataIndex: "fileName", dataIndex: "fileName",
key: "fileName", key: "fileName",
ellipsis: true, ellipsis: true,
render: (text: string, record: any) => ( render: (text: string, record: DatasetPreviewFile) => (
<Button <Button
type="link" type="link"
size="small" size="small"

View File

@@ -25,6 +25,7 @@ router = APIRouter(
tags=["annotation/project"] tags=["annotation/project"]
) )
logger = get_logger(__name__) logger = get_logger(__name__)
TEXT_DATASET_TYPE = "TEXT"
@router.get("/{mapping_id}/login") @router.get("/{mapping_id}/login")
async def login_label_studio( async def login_label_studio(
@@ -62,6 +63,12 @@ async def create_mapping(
detail=f"Dataset not found in DM service: {request.dataset_id}" detail=f"Dataset not found in DM service: {request.dataset_id}"
) )
dataset_type = (
getattr(dataset_info, "datasetType", None)
or getattr(dataset_info, "dataset_type", None)
or ""
).upper()
project_name = request.name or \ project_name = request.name or \
dataset_info.name or \ dataset_info.name or \
"A new project from DataMate" "A new project from DataMate"
@@ -97,6 +104,8 @@ async def create_mapping(
project_configuration["label_config"] = label_config project_configuration["label_config"] = label_config
if project_description: if project_description:
project_configuration["description"] = project_description project_configuration["description"] = project_description
if dataset_type == TEXT_DATASET_TYPE and request.segmentation_enabled is not None:
project_configuration["segmentation_enabled"] = bool(request.segmentation_enabled)
labeling_project = LabelingProject( labeling_project = LabelingProject(
id=str(uuid.uuid4()), # Generate UUID here id=str(uuid.uuid4()), # Generate UUID here

View File

@@ -23,6 +23,11 @@ class DatasetMappingCreateRequest(BaseModel):
description: Optional[str] = Field(None, alias="description", description="标注项目描述") description: Optional[str] = Field(None, alias="description", description="标注项目描述")
template_id: Optional[str] = Field(None, alias="templateId", description="标注模板ID") template_id: Optional[str] = Field(None, alias="templateId", description="标注模板ID")
label_config: Optional[str] = Field(None, alias="labelConfig", description="Label Studio XML配置") label_config: Optional[str] = Field(None, alias="labelConfig", description="Label Studio XML配置")
segmentation_enabled: Optional[bool] = Field(
None,
alias="segmentationEnabled",
description="是否启用文本分段",
)
class Config: class Config:
# allow population by field name when constructing model programmatically # allow population by field name when constructing model programmatically
@@ -49,6 +54,11 @@ class DatasetMappingResponse(BaseModel):
template_id: Optional[str] = Field(None, alias="templateId", description="关联的模板ID") template_id: Optional[str] = Field(None, alias="templateId", description="关联的模板ID")
template: Optional['AnnotationTemplateResponse'] = Field(None, description="关联的标注模板详情") template: Optional['AnnotationTemplateResponse'] = Field(None, description="关联的标注模板详情")
label_config: Optional[str] = Field(None, alias="labelConfig", description="实际使用的 Label Studio XML 配置") label_config: Optional[str] = Field(None, alias="labelConfig", description="实际使用的 Label Studio XML 配置")
segmentation_enabled: Optional[bool] = Field(
None,
alias="segmentationEnabled",
description="是否启用文本分段",
)
total_count: int = Field(0, alias="totalCount", description="数据集总数据量") total_count: int = Field(0, alias="totalCount", description="数据集总数据量")
annotated_count: int = Field(0, alias="annotatedCount", description="已标注数据量") annotated_count: int = Field(0, alias="annotatedCount", description="已标注数据量")
created_at: datetime = Field(..., alias="createdAt", description="创建时间") created_at: datetime = Field(..., alias="createdAt", description="创建时间")

View File

@@ -56,6 +56,7 @@ TEXTUAL_OBJECT_CATEGORIES = {"text", "document"}
MEDIA_OBJECT_CATEGORIES = {"image"} MEDIA_OBJECT_CATEGORIES = {"image"}
OBJECT_NAME_HEADER_PREFIX = "dm_object_header_" OBJECT_NAME_HEADER_PREFIX = "dm_object_header_"
SUPPORTED_EDITOR_DATASET_TYPES = ("TEXT", "IMAGE") SUPPORTED_EDITOR_DATASET_TYPES = ("TEXT", "IMAGE")
SEGMENTATION_ENABLED_KEY = "segmentation_enabled"
class AnnotationEditorService: class AnnotationEditorService:
@@ -149,6 +150,18 @@ class AnnotationEditorService:
label_config = self._decorate_label_config_for_editor(label_config) label_config = self._decorate_label_config_for_editor(label_config)
return label_config return label_config
@staticmethod
def _resolve_segmentation_enabled(project: LabelingProject) -> bool:
    """Return whether text segmentation is enabled for ``project``.

    The flag is read from ``project.configuration`` under
    ``SEGMENTATION_ENABLED_KEY``. Segmentation defaults to enabled when the
    configuration is not a dict or the key is missing / ``None``.

    Args:
        project: Labeling project whose ``configuration`` is inspected.

    Returns:
        ``True`` if segmentation should be applied, ``False`` otherwise.
    """
    config = project.configuration
    if not isinstance(config, dict):
        return True
    value = config.get(SEGMENTATION_ENABLED_KEY)
    if value is None:
        return True
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        # Plain truthiness would treat "false" / "0" as enabled because any
        # non-empty string is truthy; interpret textual flags explicitly.
        return value.strip().lower() not in {"", "0", "false", "no", "off"}
    # Other types (e.g. 0 / 1) keep truthiness semantics.
    return bool(value)
@classmethod @classmethod
def _resolve_primary_text_key(cls, label_config: Optional[str]) -> Optional[str]: def _resolve_primary_text_key(cls, label_config: Optional[str]) -> Optional[str]:
if not label_config: if not label_config:
@@ -513,13 +526,19 @@ class AnnotationEditorService:
ls_task_id = self._make_ls_task_id(project.id, file_id) ls_task_id = self._make_ls_task_id(project.id, file_id)
# 判断是否需要分段(JSONL 多行或主文本超过阈值) # 判断是否需要分段(JSONL 多行或主文本超过阈值)
needs_segmentation = len(records) > 1 or any( segmentation_enabled = self._resolve_segmentation_enabled(project)
len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts if not segmentation_enabled:
segment_index = None
needs_segmentation = segmentation_enabled and (
len(records) > 1 or any(len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts)
) )
segments: Optional[List[SegmentInfo]] = None segments: Optional[List[SegmentInfo]] = None
current_segment_index = 0 current_segment_index = 0
display_text = record_texts[0] if record_texts else text_content display_text = record_texts[0] if record_texts else text_content
selected_payload = records[0][0] if records else None selected_payload = records[0][0] if records else None
if not segmentation_enabled and len(records) > 1:
selected_payload = None
display_text = "\n".join(record_texts) if record_texts else text_content
segment_annotations: Dict[str, Any] = {} segment_annotations: Dict[str, Any] = {}
if ann and ann.annotation and ann.annotation.get("segmented"): if ann and ann.annotation and ann.annotation.get("segmented"):

View File

@@ -90,9 +90,11 @@ class DatasetMappingService:
configuration = getattr(mapping, 'configuration', None) or {} configuration = getattr(mapping, 'configuration', None) or {}
label_config = None label_config = None
description = None description = None
segmentation_enabled = None
if isinstance(configuration, dict): if isinstance(configuration, dict):
label_config = configuration.get('label_config') label_config = configuration.get('label_config')
description = configuration.get('description') description = configuration.get('description')
segmentation_enabled = configuration.get('segmentation_enabled')
# Optionally fetch full template details # Optionally fetch full template details
template_response = None template_response = None
@@ -117,6 +119,7 @@ class DatasetMappingService:
"template_id": template_id, "template_id": template_id,
"template": template_response, "template": template_response,
"label_config": label_config, "label_config": label_config,
"segmentation_enabled": segmentation_enabled,
"total_count": total_count, "total_count": total_count,
"annotated_count": annotated_count, "annotated_count": annotated_count,
"created_at": mapping.created_at, "created_at": mapping.created_at,
@@ -154,9 +157,11 @@ class DatasetMappingService:
configuration = getattr(mapping, 'configuration', None) or {} configuration = getattr(mapping, 'configuration', None) or {}
label_config = None label_config = None
description = None description = None
segmentation_enabled = None
if isinstance(configuration, dict): if isinstance(configuration, dict):
label_config = configuration.get('label_config') label_config = configuration.get('label_config')
description = configuration.get('description') description = configuration.get('description')
segmentation_enabled = configuration.get('segmentation_enabled')
# Optionally fetch full template details # Optionally fetch full template details
template_response = None template_response = None
@@ -184,6 +189,7 @@ class DatasetMappingService:
"template_id": template_id, "template_id": template_id,
"template": template_response, "template": template_response,
"label_config": label_config, "label_config": label_config,
"segmentation_enabled": segmentation_enabled,
"total_count": total_count, "total_count": total_count,
"annotated_count": annotated_count, "annotated_count": annotated_count,
"created_at": mapping.created_at, "created_at": mapping.created_at,