feat: enhance backend deployment, frontend file selection and synthesis task management (#129)

* feat: Implement data synthesis task management with database models and API endpoints

* feat: Update Python version requirements and refine dependency constraints in configuration

* fix: Correctly extract file values from selectedFilesMap in AddDataDialog

* feat: Refactor synthesis task routes and enhance file task management in the API

* feat: Enhance SynthesisTaskTab with tooltip actions and add chunk data retrieval in API
This commit is contained in:
Dallas98
2025-12-04 09:57:13 +08:00
committed by GitHub
parent 1d19cd3a62
commit 7012a9ad98
14 changed files with 975 additions and 1193 deletions

View File

@@ -5,12 +5,14 @@ on:
branches: [ "main" ]
paths:
- 'scripts/images/datamate-python/**'
- 'runtime/datamate-python/**'
- '.github/workflows/docker-image-backend-python.yml'
- '.github/workflows/docker-images-reusable.yml'
pull_request:
branches: [ "main" ]
paths:
- 'scripts/images/datamate-python/**'
- 'runtime/datamate-python/**'
- '.github/workflows/docker-image-backend-python.yml'
- '.github/workflows/docker-images-reusable.yml'
workflow_dispatch:

View File

@@ -10,6 +10,7 @@ services:
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
command: etcd -advertise-client-urls=http://etcd:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
restart: always
networks:
- datamate
healthcheck:

View File

@@ -1,4 +1,4 @@
import React, { useEffect } from "react";
import React, { useCallback, useEffect } from "react";
import { Button, Input, Table } from "antd";
import { RightOutlined } from "@ant-design/icons";
import { mapDataset } from "@/pages/DataManagement/dataset.const";
@@ -19,6 +19,7 @@ interface DatasetFileTransferProps
open: boolean;
selectedFilesMap: { [key: string]: DatasetFile };
onSelectedFilesChange: (filesMap: { [key: string]: DatasetFile }) => void;
onDatasetSelect?: (dataset: Dataset | null) => void;
}
const fileCols = [
@@ -48,6 +49,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
open,
selectedFilesMap,
onSelectedFilesChange,
onDatasetSelect,
...props
}) => {
const [datasets, setDatasets] = React.useState<Dataset[]>([]);
@@ -96,7 +98,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
300
);
const fetchFiles = async () => {
const fetchFiles = useCallback(async () => {
if (!selectedDataset) return;
const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
page: filesPagination.current - 1,
@@ -104,23 +106,25 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
keyword: filesSearch,
});
setFiles(
data.content.map((item) => ({
(data.content || []).map((item: DatasetFile) => ({
...item,
key: item.id,
datasetName: selectedDataset.name,
})) || []
}))
);
setFilesPagination((prev) => ({
...prev,
total: data.totalElements,
}));
};
}, [filesPagination.current, filesPagination.pageSize, filesSearch, selectedDataset]);
useEffect(() => {
if (selectedDataset) {
fetchFiles();
}
}, [selectedDataset]);
fetchFiles().catch(() => {});
}, [fetchFiles]);
useEffect(() => {
onDatasetSelect?.(selectedDataset);
}, [selectedDataset, onDatasetSelect]);
const toggleSelectFile = (record: DatasetFile) => {
if (!selectedFilesMap[record.id]) {
@@ -147,8 +151,9 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
setShowFiles(false);
setSelectedDataset(null);
setDatasetSelections([]);
onDatasetSelect?.(null);
}
}, [open]);
}, [open, onDatasetSelect]);
const datasetCols = [
{
@@ -206,7 +211,15 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
})}
dataSource={datasets}
columns={datasetCols}
pagination={datasetPagination}
pagination={{
...datasetPagination,
onChange: (page, pageSize) =>
setDatasetPagination({
current: page,
pageSize: pageSize || datasetPagination.pageSize,
total: datasetPagination.total,
}),
}}
/>
</div>
<RightOutlined />
@@ -231,21 +244,11 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
})}
rowSelection={{
type: "checkbox",
onSelectAll: (selected, _, changeRows) => {
const newSelectedFiles = { ...selectedFilesMap };
if (selected) {
changeRows.forEach((row) => {
newSelectedFiles[row.id] = row;
});
} else {
changeRows.forEach((row) => {
delete newSelectedFiles[row.id];
});
}
onSelectedFilesChange(newSelectedFiles);
},
selectedRowKeys: Object.keys(selectedFilesMap),
onSelect: toggleSelectFile,
getCheckboxProps: (record: DatasetFile) => ({
name: record.fileName,
}),
}}
/>
</div>

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,186 @@
import { useEffect, useState } from "react";
import { useParams, useNavigate } from "react-router";
import { Table, Badge, Button } from "antd";
import type { ColumnsType, TablePaginationConfig } from "antd/es/table";
import { querySynthesisFileTasksUsingGet, querySynthesisTaskByIdUsingGet } from "@/pages/SynthesisTask/synthesis-api";
import type { BadgeProps } from "antd";
import { formatDateTime } from "@/utils/unit";
/**
 * One row of the per-file task table: the synthesis work tracked for a single
 * source file inside a synthesis task.
 */
interface SynthesisFileTaskItem {
  // Identity and ownership
  id: string;
  synthesis_instance_id: string;
  source_file_id: string;

  // File naming / output location
  file_name: string;
  target_file_location: string;

  // Progress counters, rendered as "processed/total"
  processed_chunks: number;
  total_chunks: number;

  /** Raw status string; unrecognised or missing values are rendered as-is or as "unknown". */
  status?: string;

  // Timestamps, formatted for display when present
  created_at?: string;
  updated_at?: string;
}
/**
 * Generic page envelope used by the paginated synthesis endpoints.
 *
 * @typeParam T Element type of the page content.
 */
interface PagedResponse<T> {
  /** Items on the current page. */
  content: T[];
  /** Page number echoed back by the server. */
  page: number;
  /** Page size echoed back by the server. */
  size: number;
  /** Total number of items across all pages. */
  totalElements: number;
  /** Total number of pages for the current page size. */
  totalPages: number;
}
/** Summary of the parent synthesis task shown in the page header. */
interface SynthesisTaskInfo {
  id: string;
  name: string;
  /** Raw status string; known values get a localized label, others are shown verbatim. */
  status: string;
  /** Synthesis type code, e.g. "QA" or "COT". */
  synthesis_type: string;
  model_id: string;
  created_at: string;
}
/** Header labels for known synthesis types; unknown types are shown verbatim. */
const SYNTHESIS_TYPE_LABELS = new Map<string, string>([
  ["QA", "问答对生成"],
  ["COT", "链式推理生成"],
]);

/** Header labels for known task statuses; unknown statuses are shown verbatim. */
const TASK_STATUS_LABELS = new Map<string, string>([
  ["pending", "等待中"],
  ["completed", "已完成"],
  ["failed", "失败"],
]);

/** Badge appearance for known file-task statuses in the table. */
const FILE_STATUS_BADGES = new Map<
  string,
  { badge: BadgeProps["status"]; text: string }
>([
  ["pending", { badge: "processing", text: "处理中" }],
  ["processing", { badge: "processing", text: "处理中" }],
  ["completed", { badge: "success", text: "已完成" }],
  ["failed", { badge: "error", text: "失败" }],
]);

/**
 * Detail page for a single synthesis task.
 *
 * Reads the task id from the route, loads the task summary for the header and
 * shows a server-paginated table of the per-file tasks that belong to it.
 */
export default function SynthFileTask() {
  const { id: taskId = "" } = useParams();
  const navigate = useNavigate();

  const [loading, setLoading] = useState(false);
  const [data, setData] = useState<SynthesisFileTaskItem[]>([]);
  const [pagination, setPagination] = useState<TablePaginationConfig>({
    current: 1,
    pageSize: 10,
    total: 0,
  });
  const [taskInfo, setTaskInfo] = useState<SynthesisTaskInfo | null>(null);

  // Load the parent task summary shown in the header.
  useEffect(() => {
    if (!taskId) return;
    // Ignore a response that arrives after the id changed or the page unmounted.
    let cancelled = false;
    querySynthesisTaskByIdUsingGet(taskId)
      .then((res) => {
        if (cancelled) return;
        // Accept both `{ data: task }` and `{ data: { data: task } }`, the same
        // tolerance fetchData applies to the paged response.
        setTaskInfo(res?.data?.data ?? res?.data ?? null);
      })
      .catch(() => {
        // A failed lookup simply hides the header instead of surfacing an
        // unhandled promise rejection.
        if (!cancelled) setTaskInfo(null);
      });
    return () => {
      cancelled = true;
    };
  }, [taskId]);

  /**
   * Fetches one page of file tasks for the current task and syncs the table state.
   *
   * @param page Page number passed to the API as `page`.
   * @param pageSize Page size passed to the API as `page_size`.
   */
  const fetchData = async (page = 1, pageSize = 10): Promise<void> => {
    if (!taskId) return;
    setLoading(true);
    try {
      const res = await querySynthesisFileTasksUsingGet(taskId, {
        page,
        page_size: pageSize,
      });
      const emptyPage: PagedResponse<SynthesisFileTaskItem> = {
        content: [],
        totalElements: 0,
        totalPages: 0,
        page,
        size: pageSize,
      };
      const payload: PagedResponse<SynthesisFileTaskItem> =
        res?.data?.data ?? res?.data ?? emptyPage;
      setData(payload.content || []);
      setPagination({
        current: payload.page ?? page,
        pageSize: payload.size ?? pageSize,
        total: payload.totalElements ?? payload.content?.length ?? 0,
      });
    } catch {
      // Callers invoke this fire-and-forget, so handle the failure here:
      // show an empty table rather than leaving the promise rejected.
      setData([]);
      setPagination((prev) => ({ ...prev, total: 0 }));
    } finally {
      setLoading(false);
    }
  };

  // Reload from the first page whenever the task in the route changes.
  useEffect(() => {
    void fetchData(1, pagination.pageSize || 10);
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [taskId]);

  const handleTableChange = (pag: TablePaginationConfig) => {
    void fetchData(pag.current || 1, pag.pageSize || 10);
  };

  const columns: ColumnsType<SynthesisFileTaskItem> = [
    {
      title: "文件名",
      dataIndex: "file_name",
      key: "file_name",
    },
    {
      title: "状态",
      dataIndex: "status",
      key: "status",
      render: (status?: string) => {
        const known = status ? FILE_STATUS_BADGES.get(status) : undefined;
        const badgeStatus: BadgeProps["status"] = known?.badge ?? "default";
        // Unknown statuses are shown verbatim; a missing status shows "未知".
        const text = known?.text ?? (status || "未知");
        return <Badge status={badgeStatus} text={text} />;
      },
    },
    {
      title: "切片进度",
      key: "chunks",
      render: (_text, record) => (
        <span>
          {record.processed_chunks}/{record.total_chunks}
        </span>
      ),
    },
    {
      title: "目标文件路径",
      dataIndex: "target_file_location",
      key: "target_file_location",
      ellipsis: true,
    },
    {
      title: "创建时间",
      dataIndex: "created_at",
      key: "created_at",
      render: (val?: string) => (val ? formatDateTime(val) : "-"),
    },
    {
      title: "更新时间",
      dataIndex: "updated_at",
      key: "updated_at",
      render: (val?: string) => (val ? formatDateTime(val) : "-"),
    },
  ];

  return (
    <div className="p-4 bg-white rounded-lg h-full flex flex-col">
      {/* Task summary on the left, back button on the right */}
      <div className="flex items-center justify-between mb-4">
        <div className="space-y-1">
          {taskInfo && (
            <>
              <div className="text-lg font-medium flex items-center gap-2">
                <span>{taskInfo.name}</span>
                <span className="text-xs px-2 py-0.5 rounded bg-blue-50 text-blue-700 border border-blue-200">
                  {SYNTHESIS_TYPE_LABELS.get(taskInfo.synthesis_type) ?? taskInfo.synthesis_type}
                </span>
                <span className="text-xs px-2 py-0.5 rounded bg-gray-50 text-gray-700 border border-gray-200">
                  {TASK_STATUS_LABELS.get(taskInfo.status) ?? taskInfo.status}
                </span>
              </div>
              <div className="text-xs text-gray-500 flex gap-4">
                <span>{formatDateTime(taskInfo.created_at)}</span>
                <span>ID{taskInfo.model_id}</span>
              </div>
            </>
          )}
        </div>
        {/* The button previously rendered with neither text nor icon; give it a visible label. */}
        <Button type="default" onClick={() => navigate("/data/synthesis/task")}>
          返回
        </Button>
      </div>
      {/* File task table (server-side pagination) */}
      <Table<SynthesisFileTaskItem>
        rowKey="id"
        loading={loading}
        dataSource={data}
        columns={columns}
        pagination={pagination}
        onChange={handleTableChange}
      />
    </div>
  );
}

View File

@@ -1,112 +1,116 @@
import { useState } from "react";
import { Card, Button, Badge, Table, Progress } from "antd";
import { useState, useEffect, ElementType } from "react";
import { Card, Button, Badge, Table, Modal, message, Tooltip } from "antd";
import {
Plus,
Sparkles,
ArrowUp,
ArrowDown,
Pause,
Play,
DownloadIcon,
CheckCircle,
Check,
StopCircle,
Sparkles,
} from "lucide-react";
import type { SynthesisTask } from "@/pages/SynthesisTask/synthesis";
import { mockSynthesisTasks } from "@/mock/synthesis";
import { DeleteOutlined, EyeOutlined } from "@ant-design/icons";
import { Link, useNavigate } from "react-router";
import { SearchControls } from "@/components/SearchControls";
import { formatDateTime } from "@/utils/unit";
import {
querySynthesisTasksUsingGet,
deleteSynthesisTaskByIdUsingDelete,
} from "@/pages/SynthesisTask/synthesis-api";
/** A synthesis task as returned by the task list endpoint and rendered in the task table. */
interface SynthesisTask {
  // Identity and descriptive fields
  id: string;
  name: string;
  description?: string;

  /** Synthesis type code, e.g. "QA" or "COT". */
  synthesis_type: string;
  /** Raw status string; unknown values fall back to the "pending" badge. */
  status: string;
  model_id: string;

  // Configuration captured when the task was created
  text_split_config?: {
    chunk_size: number;
    chunk_overlap: number;
  };
  synthesis_config?: {
    temperature?: number | null;
    prompt_template?: string;
    synthesis_count?: number | null;
  };
  /** Source file ids; its length is the fallback for the file-count column. */
  source_file_id?: string[];

  // Progress and result counters
  progress?: number;
  total_files?: number;
  processed_files?: number;
  total_chunks?: number;
  processed_chunks?: number;
  total_synthesis_data?: number;
  result_data_location?: string;

  // Audit fields
  created_at: string;
  updated_at?: string;
  created_by?: string;
  updated_by?: string;
}
export default function SynthesisTaskTab() {
const navigate = useNavigate();
const [searchQuery, setSearchQuery] = useState("");
const [tasks, setTasks] = useState<SynthesisTask[]>(mockSynthesisTasks);
const [tasks, setTasks] = useState<SynthesisTask[]>([]);
const [filterStatus, setFilterStatus] = useState("all");
const [sortBy, setSortBy] = useState<"createdAt" | "name">("createdAt");
const [sortOrder, setSortOrder] = useState<"asc" | "desc">("desc");
const [page, setPage] = useState(1);
const [pageSize, setPageSize] = useState(10);
const [total, setTotal] = useState(0);
const [loading, setLoading] = useState(false);
// 过滤任务
const filteredTasks = tasks.filter((task) => {
const matchesSearch =
task.name.toLowerCase().includes(searchQuery.toLowerCase()) ||
task.template.toLowerCase().includes(searchQuery.toLowerCase());
const matchesStatus =
filterStatus === "all" || task.status === filterStatus;
return matchesSearch && matchesStatus;
});
// 排序任务
const sortedTasks = [...filteredTasks].sort((a, b) => {
if (sortBy === "createdAt") {
const dateA = new Date(a.createdAt).getTime();
const dateB = new Date(b.createdAt).getTime();
return sortOrder === "asc" ? dateA - dateB : dateB - dateA;
} else if (sortBy === "name") {
return sortOrder === "asc"
? a.name.localeCompare(b.name)
: b.name.localeCompare(a.name);
// 获取任务列表
const loadTasks = async () => {
setLoading(true);
try {
const params = {
page: page,
page_size: pageSize,
} as {
page?: number;
page_size?: number;
synthesis_type?: string;
status?: string;
name?: string;
};
if (searchQuery) params.name = searchQuery;
if (filterStatus !== "all") params.synthesis_type = filterStatus;
const res = await querySynthesisTasksUsingGet(params);
setTasks(res?.data?.content || []);
setTotal(res?.data?.totalElements || 0);
} catch {
setTasks([]);
setTotal(0);
} finally {
setLoading(false);
}
return 0;
});
const handleTaskAction = (taskId: number, action: string) => {
setTasks((prev) =>
prev.map((task) => {
if (task.id === taskId) {
switch (action) {
case "pause":
return { ...task, status: "paused" as const };
case "resume":
return { ...task, status: "running" as const };
case "stop":
return {
...task,
status: "failed" as const,
progress: task.progress,
};
default:
return task;
}
}
return task;
})
);
};
useEffect(() => {
loadTasks();
// eslint-disable-next-line
}, [searchQuery, filterStatus, page, pageSize]);
// 状态徽章
const getStatusBadge = (status: string) => {
const statusConfig = {
pending: {
label: "等待中",
color: "#F59E0B",
icon: Pause,
},
running: {
label: "运行中",
color: "#3B82F6",
icon: Play,
},
completed: {
label: "已完成",
color: "#10B981",
icon: CheckCircle,
},
failed: {
label: "失败",
color: "#EF4444",
icon: Pause,
},
paused: {
label: "已暂停",
color: "#E5E7EB",
icon: Pause,
},
const statusConfig: Record<string, { label: string; color: string; icon: ElementType }> = {
pending: { label: "等待中", color: "#F59E0B", icon: Pause },
running: { label: "运行中", color: "#3B82F6", icon: Play },
completed: { label: "已完成", color: "#10B981", icon: CheckCircle },
failed: { label: "失败", color: "#EF4444", icon: Pause },
paused: { label: "已暂停", color: "#E5E7EB", icon: Pause },
};
return (
statusConfig[status as keyof typeof statusConfig] || statusConfig.pending
);
return statusConfig[status] ?? statusConfig["pending"];
};
// 任务表格列
// 类型映射
const typeMap: Record<string, string> = {
QA: "问答对生成",
COT: "链式推理生成",
};
// 表格列
const taskColumns = [
{
title: (
@@ -134,98 +138,77 @@ export default function SynthesisTaskTab() {
dataIndex: "name",
key: "name",
fixed: "left" as const,
render: (text: string, task: SynthesisTask) => (
render: (_: unknown, task: SynthesisTask) => (
<div className="flex items-center gap-3">
<div className="w-8 h-8 bg-blue-500 rounded-lg flex items-center justify-center shadow-sm">
{/* 可根据 type 渲染不同图标 */}
<span className="text-white font-bold text-base">
{task.type?.toUpperCase()?.slice(0, 1) || "T"}
{task.synthesis_type?.toUpperCase()?.slice(0, 1) || "T"}
</span>
</div>
<div>
<Link to={`/data/synthesis/task/${task.id}`}>{task.name}</Link>
<div className="text-xs text-gray-500">{task.template}</div>
</div>
</div>
),
},
{
title: "类型",
dataIndex: "type",
key: "type",
render: (type: string) => type.toUpperCase(),
dataIndex: "synthesis_type",
key: "synthesis_type",
render: (type: string) => typeMap[type] || type,
},
{
title: "状态",
dataIndex: "status",
key: "status",
render: (status: string) => {
const statusConfig = getStatusBadge(status);
return <Badge color={statusConfig.color} text={statusConfig.label} />;
},
},
{
title: "进度",
dataIndex: "progress",
key: "progress",
width: 150,
render: (_: any, task: SynthesisTask) => (
<Progress percent={task.progress} size="small" />
),
},
{
title: "源数据集",
dataIndex: "sourceDataset",
key: "sourceDataset",
render: (text: string) => (
<div className="text-sm text-gray-900">{text}</div>
),
},
{
title: "生成数量",
dataIndex: "generatedCount",
key: "generatedCount",
render: (_: any, task: SynthesisTask) => (
<div className="text-sm font-medium text-gray-900">
{task.generatedCount?.toLocaleString?.()} /{" "}
{task.targetCount?.toLocaleString?.()}
</div>
),
},
{
title: "质量评分",
dataIndex: "quality",
key: "quality",
render: (quality: number) => (quality ? `${quality}%` : "-"),
title: "文件数",
dataIndex: "total_files",
key: "total_files",
render: (num: number, task: SynthesisTask) => <span>{num ?? (task.source_file_id?.length ?? 0)}</span>,
},
{
title: "创建时间",
dataIndex: "createdAt",
key: "createdAt",
render: formatDateTime,
dataIndex: "created_at",
key: "created_at",
render: (val: string) => formatDateTime(val),
},
{
title: "操作",
key: "actions",
fixed: "right" as const,
render: (_: any, task: SynthesisTask) => (
render: (_: unknown, task: SynthesisTask) => (
<div className="flex items-center justify-center gap-1">
{task.status === "running" && (
<Tooltip title="查看详情">
<Button
onClick={() => handleTaskAction(task.id, "pause")}
className="hover:bg-orange-50 p-1 h-7 w-7"
onClick={() => navigate(`/data/synthesis/task/${task.id}`)}
className="hover:bg-blue-50 p-1 h-7 w-7"
type="text"
icon={<Pause className="w-4 h-4" />}
></Button>
)}
{task.status === "paused" && (
icon={<EyeOutlined />}
/>
</Tooltip>
<Tooltip title="删除任务">
<Button
onClick={() => handleTaskAction(task.id, "resume")}
className="hover:bg-green-50 p-1 h-7 w-7"
danger
type="text"
icon={<Play className="w-4 h-4" />}
></Button>
)}
className="hover:bg-red-50 p-1 h-7 w-7"
icon={<DeleteOutlined />}
onClick={() => {
Modal.confirm({
title: `确认删除任务?`,
content: `任务名:${task.name}`,
okText: "删除",
okType: "danger",
cancelText: "取消",
onOk: async () => {
try {
await deleteSynthesisTaskByIdUsingDelete(task.id);
message.success("删除成功");
loadTasks();
} catch {
message.error("删除失败");
}
},
});
}}
/>
</Tooltip>
</div>
),
},
@@ -237,18 +220,15 @@ export default function SynthesisTaskTab() {
<SearchControls
searchTerm={searchQuery}
onSearchChange={setSearchQuery}
searchPlaceholder="搜索任务名称或模板..."
searchPlaceholder="搜索任务名称..."
filters={[
{
key: "status",
label: "状态",
label: "类型",
options: [
{ label: "全部状态", value: "all" },
{ label: "等待中", value: "pending" },
{ label: "运行中", value: "running" },
{ label: "已完成", value: "completed" },
{ label: "失败", value: "failed" },
{ label: "已暂停", value: "paused" },
{ label: "全部类型", value: "all" },
{ label: "问答对生成", value: "QA" },
{ label: "链式推理生成", value: "COT" },
],
},
]}
@@ -259,13 +239,23 @@ export default function SynthesisTaskTab() {
showFilters
showViewToggle={false}
/>
{/* 任务表格 */}
<Card>
<Table
columns={taskColumns}
dataSource={sortedTasks}
dataSource={tasks}
rowKey="id"
loading={loading}
pagination={{
current: page,
pageSize: pageSize,
total: total,
onChange: (p, ps) => {
setPage(p);
setPageSize(ps);
},
showSizeChanger: true,
}}
scroll={{ x: "max-content" }}
locale={{
emptyText: (

View File

@@ -0,0 +1,37 @@
import { get, post, del } from "@/utils/request";
/**
 * Creates a data synthesis task.
 *
 * @param data Request body, forwarded unchanged to the backend.
 * @returns Whatever the shared `post` helper returns for this request.
 */
export function createSynthesisTaskUsingPost(data: unknown) {
  const endpoint = "/api/synthesis/gen/task";
  return post(endpoint, data);
}
/**
 * Fetches the details of one data synthesis task.
 *
 * @param taskId Id of the synthesis task. It is percent-encoded before being
 *   placed in the URL so reserved characters such as `/`, `?` or `#` cannot
 *   change the request path.
 * @returns Whatever the shared `get` helper returns for this request.
 */
export function querySynthesisTaskByIdUsingGet(taskId: string) {
  return get(`/api/synthesis/gen/task/${encodeURIComponent(taskId)}`);
}
/**
 * Queries the paginated list of data synthesis tasks.
 *
 * @param params Optional paging (`page`, `page_size`) and filter
 *   (`synthesis_type`, `status`, `name`) query parameters.
 * @returns Whatever the shared `get` helper returns for this request.
 */
export function querySynthesisTasksUsingGet(params: {
  page?: number;
  page_size?: number;
  synthesis_type?: string;
  status?: string;
  name?: string;
}) {
  const endpoint = "/api/synthesis/gen/tasks";
  return get(endpoint, params as any);
}
/**
 * Deletes an entire data synthesis task.
 *
 * @param taskId Id of the synthesis task. It is percent-encoded before being
 *   placed in the URL so reserved characters cannot redirect the DELETE to a
 *   different path.
 * @returns Whatever the shared `del` helper returns for this request.
 */
export function deleteSynthesisTaskByIdUsingDelete(taskId: string) {
  return del(`/api/synthesis/gen/task/${encodeURIComponent(taskId)}`);
}
/**
 * Queries the paginated list of file tasks that belong to one synthesis task.
 *
 * @param taskId Id of the parent synthesis task. It is percent-encoded before
 *   being placed in the URL path.
 * @param params Optional paging parameters (`page`, `page_size`).
 * @returns Whatever the shared `get` helper returns for this request.
 */
export function querySynthesisFileTasksUsingGet(taskId: string, params: { page?: number; page_size?: number }) {
  return get(`/api/synthesis/gen/task/${encodeURIComponent(taskId)}/files`, params as any);
}
/**
 * Fetches the prompt associated with a synthesis type.
 *
 * @param synthType Synthesis type, sent as the `synth_type` query parameter.
 * @returns Whatever the shared `get` helper returns for this request.
 */
export function getPromptByTypeUsingGet(synthType: string) {
  const query = { synth_type: synthType };
  return get("/api/synthesis/gen/prompt", query as any);
}

View File

@@ -40,6 +40,7 @@ import { withErrorBoundary } from "@/components/ErrorBoundary";
import AgentPage from "@/pages/Agent/Agent.tsx";
import RatioTaskDetail from "@/pages/RatioTask/Detail/RatioTaskDetail";
import CleansingTemplateDetail from "@/pages/DataCleansing/Detail/TemplateDetail";
import SynthFileTask from "@/pages/SynthesisTask/SynthFileTask.tsx";
import EvaluationDetailPage from "@/pages/DataEvaluation/Detail/TaskDetail.tsx";
const router = createBrowserRouter([
@@ -160,6 +161,7 @@ const router = createBrowserRouter([
path: "create",
Component: SynthesisTaskCreate,
},
{path: ":id", Component: SynthFileTask},
],
},
{

View File

@@ -1,8 +1,8 @@
from fastapi import APIRouter
router = APIRouter(
prefix="/synth",
tags = ["synth"]
prefix="/synthesis",
tags = ["synthesis"]
)
# Include sub-routers

View File

@@ -18,7 +18,14 @@ from app.db.session import get_db
from app.module.generation.schema.generation import (
CreateSynthesisTaskRequest,
DataSynthesisTaskItem,
PagedDataSynthesisTaskResponse, SynthesisType)
PagedDataSynthesisTaskResponse,
SynthesisType,
DataSynthesisFileTaskItem,
PagedDataSynthesisFileTaskResponse,
DataSynthesisChunkItem,
PagedDataSynthesisChunkResponse,
SynthesisDataItem,
)
from app.module.generation.service.generation_service import GenerationService
from app.module.generation.service.prompt import get_prompt
from app.module.shared.schema import StandardResponse
@@ -219,19 +226,26 @@ async def delete_synthesis_task(
data=None,
)
@router.delete("/task/{task_id}/{file_id}", response_model=StandardResponse[None])
async def delete_synthesis_file_task(
task_id: str,
file_id: str,
db: AsyncSession = Depends(get_db)
):
"""删除数据合成任务中的文件任务"""
"""删除数据合成任务中的文件任务,同时刷新任务表中的文件/切片数量"""
# 先获取任务和文件任务记录
task = await db.get(DataSynthesisInstance, task_id)
if not task:
raise HTTPException(status_code=404, detail="Synthesis task not found")
file_task = await db.get(DataSynthesisFileInstance, file_id)
if not file_task:
raise HTTPException(status_code=404, detail="Synthesis file task not found")
# 删除 SynthesisData(根据文件任务ID)
await db.execute(delete(SynthesisData).where(
await db.execute(
delete(SynthesisData).where(
SynthesisData.synthesis_file_instance_id == file_id
)
)
@@ -243,11 +257,28 @@ async def delete_synthesis_file_task(
)
# 删除文件任务记录
await db.execute(delete(DataSynthesisFileInstance).where(
await db.execute(
delete(DataSynthesisFileInstance).where(
DataSynthesisFileInstance.id == file_id
)
)
# 刷新任务级别统计字段:总文件数、总文本块数、已处理文本块数
if task.total_files and task.total_files > 0:
task.total_files -= 1
if task.total_files < 0:
task.total_files = 0
await db.commit()
await db.refresh(task)
return StandardResponse(
code=200,
message="success",
data=None,
)
@router.get("/prompt", response_model=StandardResponse[str])
async def get_prompt_by_type(
synth_type: SynthesisType,
@@ -258,3 +289,157 @@ async def get_prompt_by_type(
message="Success",
data=prompt,
)
@router.get("/task/{task_id}/files", response_model=StandardResponse[PagedDataSynthesisFileTaskResponse])
async def list_synthesis_file_tasks(
    task_id: str,
    page: int = 1,
    page_size: int = 10,
    db: AsyncSession = Depends(get_db),
):
    """Return one page of the file tasks that belong to a synthesis task.

    Args:
        task_id: Id of the parent synthesis task.
        page: 1-based page number; values below 1 are treated as 1.
        page_size: Page size; values below 1 fall back to 10.
        db: Async database session.

    Raises:
        HTTPException: 404 when the synthesis task does not exist.
    """
    # Validate that the parent task exists before querying its files.
    task = await db.get(DataSynthesisInstance, task_id)
    if not task:
        raise HTTPException(status_code=404, detail="Synthesis task not found")

    base_query = select(DataSynthesisFileInstance).where(
        DataSynthesisFileInstance.synthesis_instance_id == task_id
    )

    count_q = select(func.count()).select_from(base_query.subquery())
    total = (await db.execute(count_q)).scalar_one()

    if page < 1:
        page = 1
    if page_size < 1:
        page_size = 10

    # OFFSET/LIMIT without ORDER BY lets the database return rows in any order,
    # so rows could repeat or go missing across pages. Sorting by the primary
    # key makes the page boundaries stable.
    result = await db.execute(
        base_query.order_by(DataSynthesisFileInstance.id.asc())
        .offset((page - 1) * page_size)
        .limit(page_size)
    )
    rows = result.scalars().all()

    file_items = [
        DataSynthesisFileTaskItem(
            id=row.id,
            synthesis_instance_id=row.synthesis_instance_id,
            file_name=row.file_name,
            source_file_id=row.source_file_id,
            target_file_location=row.target_file_location,
            status=row.status,
            total_chunks=row.total_chunks,
            processed_chunks=row.processed_chunks,
            created_at=row.created_at,
            updated_at=row.updated_at,
            created_by=row.created_by,
            updated_by=row.updated_by,
        )
        for row in rows
    ]

    paged = PagedDataSynthesisFileTaskResponse(
        content=file_items,
        totalElements=total,
        # Ceiling division: a partially filled last page still counts as a page.
        totalPages=(total + page_size - 1) // page_size,
        page=page,
        size=page_size,
    )

    return StandardResponse(
        code=200,
        message="Success",
        data=paged,
    )
@router.get("/file/{file_id}/chunks", response_model=StandardResponse[PagedDataSynthesisChunkResponse])
async def list_chunks_by_file(
    file_id: str,
    page: int = 1,
    page_size: int = 10,
    db: AsyncSession = Depends(get_db),
):
    """Return one page of chunk records for a file task, ordered by chunk index.

    Raises a 404 when the file task does not exist. A page below 1 is treated
    as 1 and a page size below 1 falls back to 10.
    """
    # Reject unknown file tasks up front.
    owner = await db.get(DataSynthesisFileInstance, file_id)
    if not owner:
        raise HTTPException(status_code=404, detail="Synthesis file task not found")

    chunks_of_file = select(DataSynthesisChunkInstance).where(
        DataSynthesisChunkInstance.synthesis_file_instance_id == file_id
    )

    counted = await db.execute(
        select(func.count()).select_from(chunks_of_file.subquery())
    )
    total = counted.scalar_one()

    # Normalise paging arguments.
    page = page if page >= 1 else 1
    page_size = page_size if page_size >= 1 else 10
    skip = (page - 1) * page_size

    page_stmt = (
        chunks_of_file.order_by(DataSynthesisChunkInstance.chunk_index.asc())
        .offset(skip)
        .limit(page_size)
    )
    records = (await db.execute(page_stmt)).scalars().all()

    chunk_items = []
    for record in records:
        chunk_items.append(
            DataSynthesisChunkItem(
                id=record.id,
                synthesis_file_instance_id=record.synthesis_file_instance_id,
                chunk_index=record.chunk_index,
                chunk_content=record.chunk_content,
                # Tolerate rows that do not expose chunk_metadata.
                chunk_metadata=getattr(record, "chunk_metadata", None),
            )
        )

    # Ceiling division via divmod: any remainder adds one more page.
    whole_pages, leftover = divmod(total, page_size)
    page_count = whole_pages + (1 if leftover else 0)

    paged = PagedDataSynthesisChunkResponse(
        content=chunk_items,
        totalElements=total,
        totalPages=page_count,
        page=page,
        size=page_size,
    )
    return StandardResponse(code=200, message="Success", data=paged)
@router.get("/chunk/{chunk_id}/data", response_model=StandardResponse[list[SynthesisDataItem]])
async def list_synthesis_data_by_chunk(
    chunk_id: str,
    db: AsyncSession = Depends(get_db),
):
    """Return every synthesis result row produced for the given chunk.

    Raises a 404 when the chunk does not exist.
    """
    # Make sure the chunk exists before listing its results.
    if not await db.get(DataSynthesisChunkInstance, chunk_id):
        raise HTTPException(status_code=404, detail="Chunk not found")

    stmt = select(SynthesisData).where(SynthesisData.chunk_instance_id == chunk_id)
    records = (await db.execute(stmt)).scalars().all()

    items = []
    for record in records:
        items.append(
            SynthesisDataItem(
                id=record.id,
                data=record.data,
                synthesis_file_instance_id=record.synthesis_file_instance_id,
                chunk_instance_id=record.chunk_instance_id,
            )
        )

    return StandardResponse(code=200, message="Success", data=items)

View File

@@ -70,6 +70,67 @@ class PagedDataSynthesisTaskResponse(BaseModel):
page: int
size: int
class DataSynthesisFileTaskItem(BaseModel):
    """A file task that belongs to a data synthesis task."""
    id: str
    # Id of the parent synthesis task.
    synthesis_instance_id: str
    file_name: str
    source_file_id: str
    target_file_location: str
    status: Optional[str] = None
    # Chunk counters for this file.
    total_chunks: int
    processed_chunks: int
    # Audit fields; all optional.
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None
    created_by: Optional[str] = None
    updated_by: Optional[str] = None

    class Config:
        # NOTE(review): `orm_mode` is the Pydantic v1 spelling; confirm against the pinned Pydantic version.
        orm_mode = True
class PagedDataSynthesisFileTaskResponse(BaseModel):
    """Paginated response for the file tasks of a data synthesis task."""
    # Items on the current page.
    content: List[DataSynthesisFileTaskItem]
    # Total number of matching items and the resulting page count.
    totalElements: int
    totalPages: int
    # Page number and page size of this response.
    page: int
    size: int
class DataSynthesisChunkItem(BaseModel):
    """A chunk record that belongs to a synthesis file task."""
    id: str
    # Id of the file task this chunk belongs to.
    synthesis_file_instance_id: str
    chunk_index: Optional[int] = None
    chunk_content: Optional[str] = None
    chunk_metadata: Optional[Dict[str, Any]] = None

    class Config:
        # NOTE(review): `orm_mode` is the Pydantic v1 spelling; confirm against the pinned Pydantic version.
        orm_mode = True
class PagedDataSynthesisChunkResponse(BaseModel):
    """Paginated response for a list of chunks."""
    # Items on the current page.
    content: List[DataSynthesisChunkItem]
    # Total number of matching items and the resulting page count.
    totalElements: int
    totalPages: int
    # Page number and page size of this response.
    page: int
    size: int
class SynthesisDataItem(BaseModel):
    """A single synthesis result record."""
    id: str
    # Result payload; may be absent.
    data: Optional[Dict[str, Any]] = None
    # Ids of the file task and the chunk this result belongs to.
    synthesis_file_instance_id: str
    chunk_instance_id: str

    class Config:
        # NOTE(review): `orm_mode` is the Pydantic v1 spelling; confirm against the pinned Pydantic version.
        orm_mode = True
class ChatRequest(BaseModel):
"""聊天请求参数"""
model_id: str

View File

@@ -168,11 +168,11 @@ class GenerationService:
self.db.add(chunk_record)
# 更新文件任务的分块数量
file_task.chunk_count = len(chunks)
file_task.total_chunks = len(chunks)
file_task.status = "processing"
await self.db.refresh(file_task)
await self.db.commit()
await self.db.refresh(file_task)
async def _invoke_llm_for_chunks(
self,

View File

@@ -1,7 +1,6 @@
from app.module.generation.schema.generation import SynthesisType
QA_PROMPT="""
# 角色
QA_PROMPT="""# 角色
你是一位专业的AI助手,擅长从给定的文本中提取关键信息并创建用于教学和测试的问答对。
# 任务
@@ -11,7 +10,7 @@ QA_PROMPT="""
{document}
# 要求与指令
1. **问题类型**:生成{synthesis_count - 1}-{synthesis_count + 1}个问答对。问题类型应多样化,包括但不限于:
1. **问题类型**:生成 {synthesis_count} 个左右的问答对。问题类型应多样化,包括但不限于:
* **事实性**:基于文本中明确提到的事实。
* **理解性**:需要理解上下文和概念。
* **归纳性**:需要总结或归纳多个信息点。
@@ -30,8 +29,7 @@ QA_PROMPT="""
"""
COT_PROMPT="""
# 角色
COT_PROMPT="""# 角色
你是一位专业的数据合成专家,擅长基于给定的原始文档和 COT(Chain of Thought,思维链)逻辑,生成高质量、符合实际应用场景的 COT 数据。COT 数据需包含清晰的问题、逐步推理过程和最终结论,能完整还原解决问题的思考路径。
# 任务
@@ -41,7 +39,7 @@ COT_PROMPT="""
{document}
# 要求与指令
1. **数量要求**:生成 {min\_count}-{max\_count} 条 COT 数据(min\_count={synthesis\_count-1},max\_count={synthesis\_count+1})
1. **数量要求**:生成 {synthesis_count} 条左右的 COT 数据
2. **内容要求**:
* 每条 COT 数据需包含 “问题”“思维链推理”“最终结论” 三部分,逻辑闭环,推理步骤清晰、连贯,不跳跃关键环节。
* 问题需基于文档中的事实信息、概念关联或逻辑疑问,是读完文档后自然产生的有价值问题(避免无意义或过于简单的问题)。