feature: multiple ratio configurations can be set for the data set. (#103)

feature: multiple ratio configurations can be set for the data set.
This commit is contained in:
hefanli
2025-11-24 15:28:17 +08:00
committed by GitHub
parent 497a5688e9
commit c1352ab91f
11 changed files with 258 additions and 229 deletions

View File

@@ -20,7 +20,7 @@ import java.util.Map;
@JsonIgnoreProperties(ignoreUnknown = true)
@JsonNaming(PropertyNamingStrategies.SnakeCaseStrategy.class)
public class FileTag {
private Map<String, Object> value;
private Map<String, Object> values;
private String type;
@@ -30,7 +30,7 @@ public class FileTag {
public List<String> getTags() {
List<String> tags = new ArrayList<>();
Object tagValues = value.get(type);
Object tagValues = values.get(type);
if (tagValues instanceof List) {
for (Object tag : (List<?>) tagValues) {
if (tag instanceof String) {

View File

@@ -38,7 +38,7 @@ export default function CreateRatioTask() {
const totals = String(values.totalTargetCount);
const config = ratioTaskForm.ratioConfigs.map((c) => {
return {
datasetId: c.id,
datasetId: c.source,
counts: String(c.quantity ?? 0),
filterConditions: { label: c.labelFilter, dateRange: String(c.dateRange ?? 0)},
};

View File

@@ -1,11 +1,16 @@
import React, { useMemo, useState } from "react";
import { Badge, Card, Input, Progress, Button, DatePicker, Select } from "antd";
import { BarChart3, Filter, Clock } from "lucide-react";
import React, { useMemo, useState, useEffect, FC } from "react";
import {
Badge,
Card,
Progress,
Button,
Select,
Table,
InputNumber,
Space,
} from "antd";
import { BarChart3, Filter } from "lucide-react";
import type { Dataset } from "@/pages/DataManagement/dataset.model.ts";
import dayjs from 'dayjs';
const { RangePicker } = DatePicker;
const { Option } = Select;
const TIME_RANGE_OPTIONS = [
{ label: '最近1天', value: 1 },
@@ -21,9 +26,9 @@ interface RatioConfigItem {
type: "dataset" | "label";
quantity: number;
percentage: number;
source: string;
source: string; // dataset id
labelFilter?: string;
dateRange?: string;
dateRange?: number;
}
interface RatioConfigProps {
@@ -35,169 +40,113 @@ interface RatioConfigProps {
onChange?: (configs: RatioConfigItem[]) => void;
}
const RatioConfig: React.FC<RatioConfigProps> = ({
ratioType,
selectedDatasets,
datasets,
totalTargetCount,
distributions,
onChange,
}) => {
const [ratioConfigs, setRatioConfigs] = useState<RatioConfigItem[]>([]);
const [datasetFilters, setDatasetFilters] = useState<Record<string, {
labelFilter?: string;
dateRange?: string;
}>>({});
// Build an id for a ratio item: the dataset id, a dash, and a short random
// base-36 suffix so that several items can exist for the same dataset.
const genId = (datasetId: string) => {
  const randomSuffix = Math.random().toString(36).slice(2, 9);
  return `${datasetId}-${randomSuffix}`;
};
const RatioConfig: FC<RatioConfigProps> = ({
ratioType,
selectedDatasets,
datasets,
totalTargetCount,
distributions,
onChange,
}) => {
const [ratioConfigs, setRatioConfigs] = useState<RatioConfigItem[]>([]);
// Total quantity assigned across all ratio items; an item with a missing or
// zero quantity contributes 0. Recomputed only when ratioConfigs changes.
const totalConfigured = useMemo(
  () =>
    ratioConfigs.reduce((runningTotal, item) => {
      const amount = item.quantity || 0;
      return runningTotal + amount;
    }, 0),
  [ratioConfigs]
);
// Return the label names for a dataset, i.e. the keys of its entry in
// `distributions`; a dataset with no entry yields an empty list.
const getDatasetLabels = (datasetId: string): string[] =>
  Object.keys(distributions[String(datasetId)] || {});
// Append a new, empty ratio item (quantity 0, percentage 0) for the given
// dataset and report the updated list to the parent through onChange.
const addConfig = (datasetId: string) => {
  const matched = datasets.find((d) => String(d.id) === datasetId);
  const appended: RatioConfigItem[] = [
    ...ratioConfigs,
    {
      id: genId(datasetId),
      // Fall back to the raw id when the dataset is not in the loaded list.
      name: matched?.name || datasetId,
      type: ratioType,
      quantity: 0,
      percentage: 0,
      source: datasetId,
    },
  ];
  setRatioConfigs(appended);
  onChange?.(appended);
};
// Remove the ratio item with the given id, refresh the percentages of the
// remaining items, and report the result to the parent through onChange.
const removeConfig = (configId: string) => {
  const remaining = recomputePercentages(
    ratioConfigs.filter((item) => item.id !== configId)
  );
  setRatioConfigs(remaining);
  onChange?.(remaining);
};
// Apply a partial update (quantity, label filter and/or date range) to the
// ratio item with the given id, then refresh every item's percentage and
// report the result to the parent through onChange.
const updateConfig = (
  configId: string,
  updates: Partial<
    Pick<RatioConfigItem, "quantity" | "labelFilter" | "dateRange">
  >
) => {
  const patched = ratioConfigs.map((item) => {
    if (item.id !== configId) {
      return item;
    }
    return { ...item, ...updates };
  });
  const adjusted = recomputePercentages(patched);
  setRatioConfigs(adjusted);
  onChange?.(adjusted);
};
// Return copies of the given items with `percentage` set to each item's share
// of totalTargetCount, rounded to a whole percent. When totalTargetCount is
// not positive every percentage is 0, which avoids dividing by zero.
const recomputePercentages = (configs: RatioConfigItem[]) => {
  return configs.map((item) => {
    const share =
      totalTargetCount > 0
        ? Math.round((item.quantity / totalTargetCount) * 100)
        : 0;
    return { ...item, percentage: share };
  });
};
const generateAutoRatio = () => {
const selectedCount = selectedDatasets.length;
if (selectedCount === 0) return;
const baseQuantity = Math.floor(totalTargetCount / selectedCount);
const remainder = totalTargetCount % selectedCount;
const newConfigs = selectedDatasets.map((datasetId, index) => {
let newConfigs: RatioConfigItem[] = ratioConfigs.filter(
(c) => !selectedDatasets.includes(c.source)
);
selectedDatasets.forEach((datasetId, index) => {
const dataset = datasets.find((d) => String(d.id) === datasetId);
const quantity = baseQuantity + (index < remainder ? 1 : 0);
return {
id: datasetId,
const config: RatioConfigItem = {
id: genId(datasetId),
name: dataset?.name || datasetId,
type: ratioType,
quantity,
percentage: Math.round((quantity / totalTargetCount) * 100),
source: datasetId,
labelFilter: datasetFilters[datasetId]?.labelFilter,
dateRange: datasetFilters[datasetId]?.dateRange,
};
newConfigs.push(config);
});
setRatioConfigs(newConfigs);
onChange?.(newConfigs);
};
// 更新数据集配比项
const updateDatasetQuantity = (datasetId: string, quantity: number) => {
setRatioConfigs((prev) => {
const existingIndex = prev.findIndex(
(config) => config.source === datasetId
);
const totalOtherQuantity = prev
.filter((config) => config.source !== datasetId)
.reduce((sum, config) => sum + config.quantity, 0);
const dataset = datasets.find((d) => String(d.id) === datasetId);
const newConfig: RatioConfigItem = {
id: datasetId,
name: dataset?.name || datasetId,
type: ratioType,
quantity: Math.min(quantity, totalTargetCount - totalOtherQuantity),
percentage: Math.round((quantity / totalTargetCount) * 100),
source: datasetId,
labelFilter: datasetFilters[datasetId]?.labelFilter,
dateRange: datasetFilters[datasetId]?.dateRange,
};
let newConfigs;
if (existingIndex >= 0) {
newConfigs = [...prev];
newConfigs[existingIndex] = newConfig;
} else {
newConfigs = [...prev, newConfig];
}
onChange?.(newConfigs);
return newConfigs;
});
};
// 更新筛选条件
const updateFilters = (datasetId: string, updates: {
labelFilter?: string;
dateRange?: [string, string];
}) => {
setDatasetFilters(prev => ({
...prev,
[datasetId]: {
...prev[datasetId],
...updates,
}
}));
};
// 渲染筛选器
const renderFilters = (datasetId: string) => {
const labels = getDatasetLabels(datasetId);
const config = ratioConfigs.find(c => c.source === datasetId);
const filters = datasetFilters[datasetId] || {};
return (
<div className="mb-3 p-2 bg-gray-50 rounded">
<div className="flex items-center gap-2 mb-2">
<Filter size={14} className="text-gray-400" />
<span className="text-xs font-medium"></span>
</div>
<div className="grid grid-cols-1 md:grid-cols-2 gap-3">
<div>
<div className="text-xs text-gray-500 mb-1"></div>
<Select
style={{ width: '100%' }}
placeholder="选择标签"
value={filters.labelFilter}
onChange={(value) => updateFilters(datasetId, { labelFilter: value })}
allowClear
onClear={() => updateFilters(datasetId, { labelFilter: undefined })}
>
{labels.map(label => (
<Option key={label} value={label}>{label}</Option>
))}
</Select>
</div>
<div>
<div className="text-xs text-gray-500 mb-1"></div>
<Select
style={{ width: '100%' }}
placeholder="选择标签更新时间"
value={filters.dateRange}
onChange={(dates) => updateFilters(datasetId, { dateRange: dates })}
allowClear
onClear={() => updateFilters(datasetId, { dateRange: undefined })}
>
{TIME_RANGE_OPTIONS.map(option => (
<Option key={option.value} value={option.value}>
{option.label}
</Option>
))}
</Select>
</div>
</div>
</div>
useEffect(() => {
const keep = ratioConfigs.filter((c) =>
selectedDatasets.includes(c.source)
);
};
// 选中数据集变化时,初始化筛选条件
React.useEffect(() => {
const initialFilters: Record<string, any> = {};
selectedDatasets.forEach(datasetId => {
const config = ratioConfigs.find(c => c.source === datasetId);
if (config) {
initialFilters[datasetId] = {
labelFilter: config.labelFilter,
dateRange: config.dateRange,
};
}
});
setDatasetFilters(prev => ({ ...prev, ...initialFilters }));
if (keep.length !== ratioConfigs.length) {
const adjusted = recomputePercentages(keep);
setRatioConfigs(adjusted);
onChange?.(adjusted);
}
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [selectedDatasets]);
return (
@@ -209,15 +158,18 @@ const RatioConfig: React.FC<RatioConfigProps> = ({
(:{totalConfigured}/{totalTargetCount})
</span>
</span>
<Button
type="link"
size="small"
onClick={generateAutoRatio}
disabled={selectedDatasets.length === 0}
>
</Button>
<div className="flex items-center gap-2">
<Button
type="link"
size="small"
onClick={generateAutoRatio}
disabled={selectedDatasets.length === 0}
>
</Button>
</div>
</div>
{selectedDatasets.length === 0 ? (
<div className="text-center py-8 text-gray-500">
<BarChart3 className="w-12 h-12 mx-auto mb-2 text-gray-300" />
@@ -225,7 +177,6 @@ const RatioConfig: React.FC<RatioConfigProps> = ({
</div>
) : (
<div className="flex-overflow-auto gap-4 p-4">
{/* 配比预览 */}
{ratioConfigs.length > 0 && (
<div className="p-3 bg-gray-50 rounded-lg mb-4">
<div className="grid grid-cols-2 gap-4 text-sm">
@@ -250,54 +201,146 @@ const RatioConfig: React.FC<RatioConfigProps> = ({
<div className="flex-1 overflow-auto space-y-4">
{selectedDatasets.map((datasetId) => {
const dataset = datasets.find((d) => String(d.id) === datasetId);
const config = ratioConfigs.find((c) => c.source === datasetId);
const currentQuantity = config?.quantity || 0;
if (!dataset) return null;
const datasetConfigs = ratioConfigs.filter(
(c) => c.source === datasetId
);
const labels = getDatasetLabels(datasetId);
const usedLabels = datasetConfigs
.map((c) => c.labelFilter)
.filter(Boolean) as string[];
// Column definitions for the per-dataset ratio table. Every render callback
// ignores the cell value and works from the row's RatioConfigItem (`record`).
const columns = [
// Ratio item column: filter icon followed by the item's name.
{
title: "配比项",
dataIndex: "id",
key: "id",
render: (_: any, record: RatioConfigItem) => (
<Space>
<Filter size={14} className="text-gray-400" />
<span className="text-sm">{record.name}</span>
</Space>
),
},
// Label filter column: a select over this dataset's labels.
{
title: "标签筛选",
dataIndex: "labelFilter",
key: "labelFilter",
render: (_: any, record: RatioConfigItem) => {
// Offer only labels not already used by another item of this dataset,
// plus the label this row currently holds.
const availableLabels = labels
.map((l) => ({ label: l, value: l }))
.filter(
(opt) =>
opt.value === record.labelFilter ||
!usedLabels.includes(opt.value)
);
return (
<Select
style={{ width: "160px" }}
placeholder="选择标签"
value={record.labelFilter}
options={availableLabels}
allowClear
onChange={(value) =>
// Any falsy value (presumably a cleared selection) is stored as undefined.
updateConfig(record.id, {
labelFilter: value || undefined,
})
}
/>
);
},
},
// Label update time column: a select over TIME_RANGE_OPTIONS.
{
title: "标签更新时间",
dataIndex: "dateRange",
key: "dateRange",
render: (_: any, record: RatioConfigItem) => (
<Select
style={{ width: "140px" }}
placeholder="选择标签更新时间"
value={record.dateRange}
options={TIME_RANGE_OPTIONS}
allowClear
onChange={(value) =>
// Any falsy value (presumably a cleared selection) is stored as undefined.
updateConfig(record.id, {
dateRange: value || undefined,
})
}
/>
),
},
// Quantity column: numeric input for this item's quantity.
{
title: "数量",
dataIndex: "quantity",
key: "quantity",
render: (_: any, record: RatioConfigItem) => (
<InputNumber
min={0}
// Upper bound is the smaller of the dataset's file count (0 when unset)
// and the overall target count.
max={Math.min(dataset.fileCount || 0, totalTargetCount)}
value={record.quantity}
onChange={(v) =>
// A falsy input value (e.g. an emptied field) is stored as 0.
updateConfig(record.id, { quantity: Number(v || 0) })
}
/>
),
},
// Percentage column: the item's percentage as text and as a progress bar.
{
title: "占比",
dataIndex: "percentage",
key: "percentage",
render: (_: any, record: RatioConfigItem) => (
<div style={{ minWidth: 140 }}>
<div className="text-xs mb-1">
{record.percentage ?? 0}%
</div>
<Progress
percent={record.percentage ?? 0}
size="small"
showInfo={false}
/>
</div>
),
},
// Actions column: a button that removes this ratio item.
{
title: "操作",
dataIndex: "actions",
key: "actions",
render: (_: any, record: RatioConfigItem) => (
<Button danger size="small" onClick={() => removeConfig(record.id)}>
</Button>
),
},
];
return (
<Card key={datasetId} size="small" className="mb-4">
<div className="flex items-center justify-between mb-3">
<div className="flex items-center gap-2">
<span className="font-medium text-sm">
{dataset.name}
</span>
<span className="font-medium text-sm">{dataset.name}</span>
<Badge color="gray">{dataset.fileCount}</Badge>
</div>
<div className="text-xs text-gray-500">
{config?.percentage || 0}%
{datasetConfigs.reduce((s, c) => s + (c.percentage || 0), 0)}%
</div>
</div>
{/* 筛选条件 */}
{renderFilters(datasetId)}
<div className="flex items-center gap-2 mb-2">
<span className="text-xs">:</span>
<Input
type="number"
value={currentQuantity}
onChange={(e) =>
updateDatasetQuantity(
datasetId,
Number(e.target.value)
)
}
style={{ width: 100 }}
min={0}
max={Math.min(
dataset.fileCount || 0,
totalTargetCount
)}
/>
<span className="text-xs text-gray-500"></span>
</div>
<Progress
percent={Math.round(
(currentQuantity / totalTargetCount) * 100
)}
<Table
dataSource={datasetConfigs}
columns={columns}
pagination={false}
rowKey="id"
size="small"
locale={{ emptyText: "暂无配比项,请添加" }}
/>
<div className="flex justify-end mt-3">
<Button size="small" onClick={() => addConfig(datasetId)}>
</Button>
</div>
</Card>
);
})}

View File

@@ -177,11 +177,6 @@ export default function RatioTaskDetail() {
<Badge color={ratioTask.status?.color} text={ratioTask.status?.label} />
),
},
{
key: "type",
label: "配比方式",
children: ratioTask.type || "未知",
},
{
key: "createdBy",
label: "创建者",

View File

@@ -95,12 +95,6 @@ export default function RatioTasksPage() {
);
},
},
{
title: "配比方式",
dataIndex: "ratio_method",
key: "ratio_method",
width: 120,
},
{
title: "目标数量",
dataIndex: "totals",

View File

@@ -28,7 +28,6 @@ class RatioInstance(Base):
name = Column(String(64), nullable=True, comment="名称")
description = Column(Text, nullable=True, comment="描述")
target_dataset_id = Column(String(64), nullable=True, comment="模板数据集ID")
ratio_method = Column(String(50), nullable=True, comment="配比方式,按标签(TAG),按数据集(DATASET)")
ratio_parameters = Column(JSON, nullable=True, comment="配比参数")
merge_method = Column(String(50), nullable=True, comment="合并方式")
status = Column(String(20), nullable=True, comment="状态")
@@ -39,7 +38,7 @@ class RatioInstance(Base):
updated_by = Column(String(255), nullable=True, comment="更新者")
def __repr__(self) -> str:
return f"<RatioInstance(id={self.id}, name={self.name}, method={self.ratio_method}, status={self.status})>"
return f"<RatioInstance(id={self.id}, name={self.name}, status={self.status})>"
class RatioRelation(Base):

View File

@@ -27,15 +27,15 @@ class PagedDatasetFileResponse(BaseModel):
size: int = Field(..., description="每页大小")
class DatasetFileTag(BaseModel):
id: str = Field(..., description="标签ID")
type: str = Field(..., description="类型")
from_name: str = Field(..., description="标签名称")
value: dict = Field(..., description="标签值")
id: str = Field(None, description="标签ID")
type: str = Field(None, description="类型")
from_name: str = Field(None, description="标签名称")
values: dict = Field(None, description="标签值")
def get_tags(self) -> List[str]:
tags = []
# 如果 value 是字典类型,根据 type 获取对应的值
tag_values = self.value.get(self.type, [])
# 如果 values 是字典类型,根据 type 获取对应的值
tag_values = self.values.get(self.type, [])
# 处理标签值
if isinstance(tag_values, list):
@@ -55,7 +55,7 @@ class FileTagUpdate(BaseModel):
"""单个文件的标签更新请求"""
file_id: str = Field(..., alias="fileId", description="文件ID")
tags: List[Dict[str, Any]] = Field(..., description="要更新的标签列表(部分更新)")
class Config:
populate_by_name = True
@@ -63,7 +63,7 @@ class FileTagUpdate(BaseModel):
class BatchUpdateFileTagsRequest(BaseModel):
"""批量更新文件标签请求"""
updates: List[FileTagUpdate] = Field(..., description="文件标签更新列表", min_length=1)
class Config:
populate_by_name = True
@@ -74,7 +74,7 @@ class FileTagUpdateResult(BaseModel):
success: bool = Field(..., description="是否更新成功")
message: Optional[str] = Field(None, description="结果信息")
tags_updated_at: Optional[datetime] = Field(None, alias="tagsUpdatedAt", description="标签更新时间")
class Config:
populate_by_name = True
@@ -85,6 +85,6 @@ class BatchUpdateFileTagsResponse(BaseModel):
total: int = Field(..., description="总更新数量")
success_count: int = Field(..., alias="successCount", description="成功数量")
failure_count: int = Field(..., alias="failureCount", description="失败数量")
class Config:
populate_by_name = True

View File

@@ -170,7 +170,6 @@ async def list_ratio_tasks(
description=i.description,
status=i.status,
totals=i.totals,
ratio_method=i.ratio_method,
target_dataset_id=i.target_dataset_id,
target_dataset_name=(ds.name if ds else None),
created_at=str(i.created_at) if getattr(i, "created_at", None) else None,
@@ -330,7 +329,6 @@ async def get_ratio_task(
description=instance.description,
status=instance.status or "UNKNOWN",
totals=instance.totals or 0,
ratio_method=instance.ratio_method or "",
config=config,
target_dataset=target_dataset_info,
created_at=instance.created_at,

View File

@@ -88,7 +88,6 @@ class RatioTaskItem(BaseModel):
description: Optional[str] = None
status: Optional[str] = None
totals: Optional[int] = None
ratio_method: Optional[str] = None
target_dataset_id: Optional[str] = None
target_dataset_name: Optional[str] = None
created_at: Optional[str] = None
@@ -110,7 +109,6 @@ class RatioTaskDetailResponse(BaseModel):
description: Optional[str] = Field(None, description="任务描述")
status: str = Field(..., description="任务状态")
totals: int = Field(..., description="目标总数")
ratio_method: str = Field(..., description="配比方式")
config: List[Dict[str, Any]] = Field(..., description="配比配置")
target_dataset: Dict[str, Any] = Field(..., description="目标数据集信息")
created_at: Optional[datetime] = Field(None, description="创建时间")

View File

@@ -1,3 +1,4 @@
from datetime import datetime
from typing import List, Optional, Dict, Any
import random
import json
@@ -173,7 +174,7 @@ class RatioTaskService:
@staticmethod
async def handle_selected_file(existing_paths: set[Any], f, session, target_ds: Dataset):
src_path = f.file_path
dst_prefix = f"/dataset/{target_ds.id}"
dst_prefix = f"/dataset/{target_ds.id}/"
file_name = RatioTaskService.get_new_file_name(dst_prefix, existing_paths, f)
new_path = dst_prefix + file_name
@@ -181,18 +182,20 @@ class RatioTaskService:
await asyncio.to_thread(os.makedirs, dst_dir, exist_ok=True)
await asyncio.to_thread(shutil.copy2, src_path, new_path)
new_file = DatasetFiles(
dataset_id=target_ds.id, # type: ignore
file_name=file_name,
file_path=new_path,
file_type=f.file_type,
file_size=f.file_size,
check_sum=f.check_sum,
tags=f.tags,
dataset_filemetadata=f.dataset_filemetadata,
status="ACTIVE",
)
session.add(new_file)
file_data = {
"dataset_id": target_ds.id, # type: ignore
"file_name": file_name,
"file_path": new_path,
"file_type": f.file_type,
"file_size": f.file_size,
"check_sum": f.check_sum,
"tags": f.tags,
"tags_updated_at": datetime.now(),
"dataset_filemetadata": f.dataset_filemetadata,
"status": "ACTIVE",
}
file_record = {k: v for k, v in file_data.items() if v is not None}
session.add(DatasetFiles(**file_record))
existing_paths.add(new_path)
@staticmethod

View File

@@ -6,7 +6,6 @@ CREATE TABLE IF NOT EXISTS t_st_ratio_instances
name varchar(64) COMMENT '名称',
description TEXT COMMENT '描述',
target_dataset_id varchar(64) COMMENT '模板数据集ID',
ratio_method varchar(50) COMMENT '配比方式,按标签(TAG),按数据集(DATASET)',
ratio_parameters JSON COMMENT '配比参数',
merge_method varchar(50) COMMENT '合并方式',
status varchar(20) COMMENT '状态',