Compare commits: fb43052ddf ... lsf (17 commits)

| SHA1 |
| --- |
| 078f303f57 |
| 50f2da5503 |
| 3af1daf8b6 |
| 7c7729434b |
| 17a62cd3c2 |
| f381d641ab |
| c8611d29ff |
| 147beb1ec7 |
| 699031dae7 |
| 88b1383653 |
| cc6415c4d9 |
| 3d036c4cd6 |
| 2445235fd2 |
| 893e0a1580 |
| 05e6842fc8 |
| da5b18e423 |
| 31629ab50b |
@@ -246,28 +246,59 @@ offline-build-classic: offline-setup
 offline-diagnose:
 	@bash scripts/offline/diagnose.sh $(CACHE_DIR)
 
+# 构建 APT 预装基础镜像(有网环境)
+.PHONY: offline-build-base-images
+offline-build-base-images:
+	@echo "构建 APT 预装基础镜像..."
+	@bash scripts/offline/build-base-images.sh $(CACHE_DIR)
+
+# 使用预装基础镜像进行离线构建(推荐)
+.PHONY: offline-build-final
+offline-build-final: offline-setup
+	@echo "使用预装 APT 包的基础镜像进行离线构建..."
+	@bash scripts/offline/build-offline-final.sh $(CACHE_DIR) $(OFFLINE_VERSION)
+
+# 完整离线导出(包含 APT 预装基础镜像)
+.PHONY: offline-export-full
+offline-export-full:
+	@echo "======================================"
+	@echo "完整离线缓存导出(含 APT 预装基础镜像)"
+	@echo "======================================"
+	@$(MAKE) offline-build-base-images
+	@$(MAKE) offline-export
+	@echo ""
+	@echo "导出完成!传输时请包含以下文件:"
+	@echo "  - build-cache/images/base-images-with-apt.tar"
+	@echo "  - build-cache-YYYYMMDD.tar.gz"
+
 # ========== 帮助 ==========
 
 .PHONY: help-offline
 help-offline:
 	@echo "离线构建命令:"
-	@echo "  make offline-export [CACHE_DIR=./build-cache] - 在有网环境导出构建缓存"
+	@echo ""
+	@echo "【有网环境】"
+	@echo "  make offline-export [CACHE_DIR=./build-cache] - 导出构建缓存"
+	@echo "  make offline-export-full - 导出完整缓存(含 APT 预装基础镜像)"
+	@echo "  make offline-build-base-images - 构建 APT 预装基础镜像"
+	@echo ""
+	@echo "【无网环境】"
 	@echo "  make offline-setup [CACHE_DIR=./build-cache] - 解压并准备离线缓存"
-	@echo "  make offline-build [CACHE_DIR=./build-cache] - 在无网环境构建所有服务(BuildKit)"
-	@echo "  make offline-build-classic - 使用传统 docker build(更稳定)"
-	@echo "  make <service>-offline-build - 离线构建单个服务"
-	@echo "    (如: make backend-offline-build)"
+	@echo "  make offline-build-final - 使用预装基础镜像构建(推荐,解决 APT 问题)"
+	@echo "  make offline-build-classic - 使用传统 docker build"
+	@echo "  make offline-build - 使用 BuildKit 构建"
 	@echo "  make offline-diagnose - 诊断离线构建环境"
+	@echo "  make <service>-offline-build - 离线构建单个服务"
 	@echo ""
-	@echo "完整工作流程:"
-	@echo "  # 1. 有网环境导出缓存"
-	@echo "  make offline-export"
+	@echo "【完整工作流程(推荐)】"
+	@echo "  # 1. 有网环境导出完整缓存"
+	@echo "  make offline-export-full"
 	@echo ""
-	@echo "  # 2. 传输缓存到无网环境"
-	@echo "  scp build-cache-*.tar.gz user@offline-server:/path/to/project/"
+	@echo "  # 2. 传输到无网环境(需要传输两个文件)"
+	@echo "  scp build-cache/images/base-images-with-apt.tar user@offline-server:/path/"
+	@echo "  scp build-cache-*.tar.gz user@offline-server:/path/"
 	@echo ""
-	@echo "  # 3. 无网环境构建(推荐先用传统方式)"
+	@echo "  # 3. 无网环境构建"
 	@echo "  tar -xzf build-cache-*.tar.gz"
-	@echo "  make offline-diagnose # 检查环境"
-	@echo "  make offline-build-classic # 传统构建(推荐)"
-	@echo "  # 或 make offline-build # BuildKit 构建"
+	@echo "  docker load -i build-cache/images/base-images-with-apt.tar"
+	@echo "  make offline-build-final"

@@ -1,5 +1,6 @@
 package com.datamate.datamanagement.application;
 
+import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper;
 import com.baomidou.mybatisplus.core.metadata.IPage;
 import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
 import com.datamate.common.domain.utils.ChunksSaver;
@@ -101,6 +102,7 @@ public class DatasetApplicationService {
     public Dataset updateDataset(String datasetId, UpdateDatasetRequest updateDatasetRequest) {
         Dataset dataset = datasetRepository.getById(datasetId);
         BusinessAssert.notNull(dataset, DataManagementErrorCode.DATASET_NOT_FOUND);
+
         if (StringUtils.hasText(updateDatasetRequest.getName())) {
             dataset.setName(updateDatasetRequest.getName());
         }
@@ -113,13 +115,31 @@ public class DatasetApplicationService {
         if (Objects.nonNull(updateDatasetRequest.getStatus())) {
             dataset.setStatus(updateDatasetRequest.getStatus());
         }
-        if (updateDatasetRequest.getParentDatasetId() != null) {
+        if (updateDatasetRequest.isParentDatasetIdProvided()) {
+            // 保存原始的 parentDatasetId 值,用于比较是否发生了变化
+            String originalParentDatasetId = dataset.getParentDatasetId();
+
+            // 处理父数据集变更:仅当请求显式包含 parentDatasetId 时处理
+            // handleParentChange 内部通过 normalizeParentId 方法将空字符串和 null 都转换为 null
+            // 这样既支持设置新的父数据集,也支持清除关联
             handleParentChange(dataset, updateDatasetRequest.getParentDatasetId());
+
+            // 检查 parentDatasetId 是否发生了变化
+            if (!Objects.equals(originalParentDatasetId, dataset.getParentDatasetId())) {
+                // 使用 LambdaUpdateWrapper 显式地更新 parentDatasetId 字段
+                // 这样即使值为 null 也能被正确更新到数据库
+                datasetRepository.update(null, new LambdaUpdateWrapper<Dataset>()
+                        .eq(Dataset::getId, datasetId)
+                        .set(Dataset::getParentDatasetId, dataset.getParentDatasetId()));
+            }
         }
+
         if (StringUtils.hasText(updateDatasetRequest.getDataSource())) {
             // 数据源id不为空,使用异步线程进行文件扫盘落库
             processDataSourceAsync(dataset.getId(), updateDatasetRequest.getDataSource());
         }
+
+        // 更新其他字段(不包括 parentDatasetId,因为它已经在上面的代码中更新了)
         datasetRepository.updateById(dataset);
         return dataset;
     }

@@ -1,8 +1,10 @@
 package com.datamate.datamanagement.interfaces.dto;
 
 import com.datamate.datamanagement.common.enums.DatasetStatusType;
+import com.fasterxml.jackson.annotation.JsonIgnore;
 import jakarta.validation.constraints.NotBlank;
 import jakarta.validation.constraints.Size;
+import lombok.AccessLevel;
 import lombok.Getter;
 import lombok.Setter;
 
@@ -24,9 +26,18 @@ public class UpdateDatasetRequest {
     /** 归集任务id */
     private String dataSource;
     /** 父数据集ID */
+    @Setter(AccessLevel.NONE)
     private String parentDatasetId;
+    @JsonIgnore
+    @Setter(AccessLevel.NONE)
+    private boolean parentDatasetIdProvided;
     /** 标签列表 */
     private List<String> tags;
     /** 数据集状态 */
     private DatasetStatusType status;
+
+    public void setParentDatasetId(String parentDatasetId) {
+        this.parentDatasetIdProvided = true;
+        this.parentDatasetId = parentDatasetId;
+    }
 }

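The DTO change above lets the backend tell "parentDatasetId was omitted" apart from "parentDatasetId was explicitly sent as null or empty": the custom setter flips a @JsonIgnore flag only when the field is actually present in the request body. A minimal TypeScript sketch of the matching client-side idea, assuming a hypothetical payload shape and helper (these names are illustrative, not part of the project):

```typescript
// Sketch: only serialize parentDatasetId when the caller explicitly set it,
// so the backend setter (and its "provided" flag) fires only for intentional changes.
interface UpdateDatasetPayload {
  name?: string;
  // omitted        -> keep the current parent
  // null           -> explicitly clear the parent association
  // "some-id"      -> re-parent the dataset
  parentDatasetId?: string | null;
}

function buildUpdateBody(payload: UpdateDatasetPayload): Record<string, unknown> {
  const body: Record<string, unknown> = {};
  if (payload.name !== undefined) {
    body.name = payload.name;
  }
  if ("parentDatasetId" in payload) {
    body.parentDatasetId = payload.parentDatasetId;
  }
  return body;
}

// Usage: clearing the parent sends an explicit null; leaving it out keeps the old value.
const clearParent = buildUpdateBody({ parentDatasetId: null }); // { parentDatasetId: null }
const renameOnly = buildUpdateBody({ name: "my-dataset" });     // { name: "my-dataset" }
```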
@@ -1,5 +1,5 @@
|
|||||||
import { TaskItem } from "@/pages/DataManagement/dataset.model";
|
import { TaskItem } from "@/pages/DataManagement/dataset.model";
|
||||||
import { calculateSHA256, checkIsFilesExist } from "@/utils/file.util";
|
import { calculateSHA256, checkIsFilesExist, streamSplitAndUpload, StreamUploadResult } from "@/utils/file.util";
|
||||||
import { App } from "antd";
|
import { App } from "antd";
|
||||||
import { useRef, useState } from "react";
|
import { useRef, useState } from "react";
|
||||||
|
|
||||||
@@ -9,17 +9,18 @@ export function useFileSliceUpload(
|
|||||||
uploadChunk,
|
uploadChunk,
|
||||||
cancelUpload,
|
cancelUpload,
|
||||||
}: {
|
}: {
|
||||||
preUpload: (id: string, params: any) => Promise<{ data: number }>;
|
preUpload: (id: string, params: Record<string, unknown>) => Promise<{ data: number }>;
|
||||||
uploadChunk: (id: string, formData: FormData, config: any) => Promise<any>;
|
uploadChunk: (id: string, formData: FormData, config: Record<string, unknown>) => Promise<unknown>;
|
||||||
cancelUpload: ((reqId: number) => Promise<any>) | null;
|
cancelUpload: ((reqId: number) => Promise<unknown>) | null;
|
||||||
},
|
},
|
||||||
showTaskCenter = true // 上传时是否显示任务中心
|
showTaskCenter = true, // 上传时是否显示任务中心
|
||||||
|
enableStreamUpload = true // 是否启用流式分割上传
|
||||||
) {
|
) {
|
||||||
const { message } = App.useApp();
|
const { message } = App.useApp();
|
||||||
const [taskList, setTaskList] = useState<TaskItem[]>([]);
|
const [taskList, setTaskList] = useState<TaskItem[]>([]);
|
||||||
const taskListRef = useRef<TaskItem[]>([]); // 用于固定任务顺序
|
const taskListRef = useRef<TaskItem[]>([]); // 用于固定任务顺序
|
||||||
|
|
||||||
const createTask = (detail: any = {}) => {
|
const createTask = (detail: Record<string, unknown> = {}) => {
|
||||||
const { dataset } = detail;
|
const { dataset } = detail;
|
||||||
const title = `上传数据集: ${dataset.name} `;
|
const title = `上传数据集: ${dataset.name} `;
|
||||||
const controller = new AbortController();
|
const controller = new AbortController();
|
||||||
@@ -37,6 +38,14 @@ export function useFileSliceUpload(
|
|||||||
taskListRef.current = [task, ...taskListRef.current];
|
taskListRef.current = [task, ...taskListRef.current];
|
||||||
|
|
||||||
setTaskList(taskListRef.current);
|
setTaskList(taskListRef.current);
|
||||||
|
|
||||||
|
// 立即显示任务中心,让用户感知上传已开始
|
||||||
|
if (showTaskCenter) {
|
||||||
|
window.dispatchEvent(
|
||||||
|
new CustomEvent("show:task-popover", { detail: { show: true } })
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
return task;
|
return task;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -60,7 +69,7 @@ export function useFileSliceUpload(
|
|||||||
// 携带前缀信息,便于刷新后仍停留在当前目录
|
// 携带前缀信息,便于刷新后仍停留在当前目录
|
||||||
window.dispatchEvent(
|
window.dispatchEvent(
|
||||||
new CustomEvent(task.updateEvent, {
|
new CustomEvent(task.updateEvent, {
|
||||||
detail: { prefix: (task as any).prefix },
|
detail: { prefix: task.prefix },
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -71,7 +80,7 @@ export function useFileSliceUpload(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
async function buildFormData({ file, reqId, i, j }) {
|
async function buildFormData({ file, reqId, i, j }: { file: { slices: Blob[]; name: string; size: number }; reqId: number; i: number; j: number }) {
|
||||||
const formData = new FormData();
|
const formData = new FormData();
|
||||||
const { slices, name, size } = file;
|
const { slices, name, size } = file;
|
||||||
const checkSum = await calculateSHA256(slices[j]);
|
const checkSum = await calculateSHA256(slices[j]);
|
||||||
@@ -86,12 +95,18 @@ export function useFileSliceUpload(
|
|||||||
return formData;
|
return formData;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function uploadSlice(task: TaskItem, fileInfo) {
|
async function uploadSlice(task: TaskItem, fileInfo: { loaded: number; i: number; j: number; files: { slices: Blob[]; name: string; size: number }[]; totalSize: number }) {
|
||||||
if (!task) {
|
if (!task) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const { reqId, key } = task;
|
const { reqId, key, controller } = task;
|
||||||
const { loaded, i, j, files, totalSize } = fileInfo;
|
const { loaded, i, j, files, totalSize } = fileInfo;
|
||||||
|
|
||||||
|
// 检查是否已取消
|
||||||
|
if (controller.signal.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
|
||||||
const formData = await buildFormData({
|
const formData = await buildFormData({
|
||||||
file: files[i],
|
file: files[i],
|
||||||
i,
|
i,
|
||||||
@@ -101,6 +116,7 @@ export function useFileSliceUpload(
|
|||||||
|
|
||||||
let newTask = { ...task };
|
let newTask = { ...task };
|
||||||
await uploadChunk(key, formData, {
|
await uploadChunk(key, formData, {
|
||||||
|
signal: controller.signal,
|
||||||
onUploadProgress: (e) => {
|
onUploadProgress: (e) => {
|
||||||
const loadedSize = loaded + e.loaded;
|
const loadedSize = loaded + e.loaded;
|
||||||
const curPercent = Number((loadedSize / totalSize) * 100).toFixed(2);
|
const curPercent = Number((loadedSize / totalSize) * 100).toFixed(2);
|
||||||
@@ -116,7 +132,7 @@ export function useFileSliceUpload(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
async function uploadFile({ task, files, totalSize }) {
|
async function uploadFile({ task, files, totalSize }: { task: TaskItem; files: { slices: Blob[]; name: string; size: number; originFile: Blob }[]; totalSize: number }) {
|
||||||
console.log('[useSliceUpload] Calling preUpload with prefix:', task.prefix);
|
console.log('[useSliceUpload] Calling preUpload with prefix:', task.prefix);
|
||||||
const { data: reqId } = await preUpload(task.key, {
|
const { data: reqId } = await preUpload(task.key, {
|
||||||
totalFileNum: files.length,
|
totalFileNum: files.length,
|
||||||
@@ -132,24 +148,29 @@ export function useFileSliceUpload(
|
|||||||
reqId,
|
reqId,
|
||||||
isCancel: false,
|
isCancel: false,
|
||||||
cancelFn: () => {
|
cancelFn: () => {
|
||||||
task.controller.abort();
|
// 使用 newTask 的 controller 确保一致性
|
||||||
|
newTask.controller.abort();
|
||||||
cancelUpload?.(reqId);
|
cancelUpload?.(reqId);
|
||||||
if (task.updateEvent) window.dispatchEvent(new Event(task.updateEvent));
|
if (newTask.updateEvent) window.dispatchEvent(new Event(newTask.updateEvent));
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
updateTaskList(newTask);
|
updateTaskList(newTask);
|
||||||
if (showTaskCenter) {
|
// 注意:show:task-popover 事件已在 createTask 中触发,此处不再重复触发
|
||||||
window.dispatchEvent(
|
|
||||||
new CustomEvent("show:task-popover", { detail: { show: true } })
|
|
||||||
);
|
|
||||||
}
|
|
||||||
// // 更新数据状态
|
// // 更新数据状态
|
||||||
if (task.updateEvent) window.dispatchEvent(new Event(task.updateEvent));
|
if (task.updateEvent) window.dispatchEvent(new Event(task.updateEvent));
|
||||||
|
|
||||||
let loaded = 0;
|
let loaded = 0;
|
||||||
for (let i = 0; i < files.length; i++) {
|
for (let i = 0; i < files.length; i++) {
|
||||||
|
// 检查是否已取消
|
||||||
|
if (newTask.controller.signal.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
const { slices } = files[i];
|
const { slices } = files[i];
|
||||||
for (let j = 0; j < slices.length; j++) {
|
for (let j = 0; j < slices.length; j++) {
|
||||||
|
// 检查是否已取消
|
||||||
|
if (newTask.controller.signal.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
await uploadSlice(newTask, {
|
await uploadSlice(newTask, {
|
||||||
loaded,
|
loaded,
|
||||||
i,
|
i,
|
||||||
@@ -163,7 +184,7 @@ export function useFileSliceUpload(
|
|||||||
removeTask(newTask);
|
removeTask(newTask);
|
||||||
}
|
}
|
||||||
|
|
||||||
const handleUpload = async ({ task, files }) => {
|
const handleUpload = async ({ task, files }: { task: TaskItem; files: { slices: Blob[]; name: string; size: number; originFile: Blob }[] }) => {
|
||||||
const isErrorFile = await checkIsFilesExist(files);
|
const isErrorFile = await checkIsFilesExist(files);
|
||||||
if (isErrorFile) {
|
if (isErrorFile) {
|
||||||
message.error("文件被修改或删除,请重新选择文件上传");
|
message.error("文件被修改或删除,请重新选择文件上传");
|
||||||
@@ -189,10 +210,175 @@ export function useFileSliceUpload(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 流式分割上传处理
|
||||||
|
* 用于大文件按行分割并立即上传的场景
|
||||||
|
*/
|
||||||
|
const handleStreamUpload = async ({ task, files }: { task: TaskItem; files: File[] }) => {
|
||||||
|
try {
|
||||||
|
console.log('[useSliceUpload] Starting stream upload for', files.length, 'files');
|
||||||
|
|
||||||
|
const totalSize = files.reduce((acc, file) => acc + file.size, 0);
|
||||||
|
|
||||||
|
// 存储所有文件的 reqId,用于取消上传
|
||||||
|
const reqIds: number[] = [];
|
||||||
|
|
||||||
|
const newTask: TaskItem = {
|
||||||
|
...task,
|
||||||
|
reqId: -1,
|
||||||
|
isCancel: false,
|
||||||
|
cancelFn: () => {
|
||||||
|
// 使用 newTask 的 controller 确保一致性
|
||||||
|
newTask.controller.abort();
|
||||||
|
// 取消所有文件的预上传请求
|
||||||
|
reqIds.forEach(id => cancelUpload?.(id));
|
||||||
|
if (newTask.updateEvent) window.dispatchEvent(new Event(newTask.updateEvent));
|
||||||
|
},
|
||||||
|
};
|
||||||
|
updateTaskList(newTask);
|
||||||
|
|
||||||
|
let totalUploadedLines = 0;
|
||||||
|
let totalProcessedBytes = 0;
|
||||||
|
const results: StreamUploadResult[] = [];
|
||||||
|
|
||||||
|
// 逐个处理文件,每个文件单独调用 preUpload
|
||||||
|
for (let i = 0; i < files.length; i++) {
|
||||||
|
// 检查是否已取消
|
||||||
|
if (newTask.controller.signal.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
|
||||||
|
const file = files[i];
|
||||||
|
console.log(`[useSliceUpload] Processing file ${i + 1}/${files.length}: ${file.name}`);
|
||||||
|
|
||||||
|
// 为每个文件单独调用 preUpload,获取独立的 reqId
|
||||||
|
const { data: reqId } = await preUpload(task.key, {
|
||||||
|
totalFileNum: 1,
|
||||||
|
totalSize: file.size,
|
||||||
|
datasetId: task.key,
|
||||||
|
hasArchive: task.hasArchive,
|
||||||
|
prefix: task.prefix,
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(`[useSliceUpload] File ${file.name} preUpload response reqId:`, reqId);
|
||||||
|
reqIds.push(reqId);
|
||||||
|
|
||||||
|
const result = await streamSplitAndUpload(
|
||||||
|
file,
|
||||||
|
(formData, config) => uploadChunk(task.key, formData, {
|
||||||
|
...config,
|
||||||
|
signal: newTask.controller.signal,
|
||||||
|
}),
|
||||||
|
(currentBytes, totalBytes, uploadedLines) => {
|
||||||
|
// 检查是否已取消
|
||||||
|
if (newTask.controller.signal.aborted) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 更新进度
|
||||||
|
const overallBytes = totalProcessedBytes + currentBytes;
|
||||||
|
const curPercent = Number((overallBytes / totalSize) * 100).toFixed(2);
|
||||||
|
|
||||||
|
const updatedTask: TaskItem = {
|
||||||
|
...newTask,
|
||||||
|
...taskListRef.current.find((item) => item.key === task.key),
|
||||||
|
size: overallBytes,
|
||||||
|
percent: curPercent >= 100 ? 99.99 : curPercent,
|
||||||
|
streamUploadInfo: {
|
||||||
|
currentFile: file.name,
|
||||||
|
fileIndex: i + 1,
|
||||||
|
totalFiles: files.length,
|
||||||
|
uploadedLines: totalUploadedLines + uploadedLines,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
updateTaskList(updatedTask);
|
||||||
|
},
|
||||||
|
1024 * 1024, // 1MB chunk size
|
||||||
|
{
|
||||||
|
reqId,
|
||||||
|
hasArchive: newTask.hasArchive,
|
||||||
|
prefix: newTask.prefix,
|
||||||
|
signal: newTask.controller.signal,
|
||||||
|
maxConcurrency: 3,
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
results.push(result);
|
||||||
|
totalUploadedLines += result.uploadedCount;
|
||||||
|
totalProcessedBytes += file.size;
|
||||||
|
|
||||||
|
console.log(`[useSliceUpload] File ${file.name} processed, uploaded ${result.uploadedCount} lines`);
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('[useSliceUpload] Stream upload completed, total lines:', totalUploadedLines);
|
||||||
|
removeTask(newTask);
|
||||||
|
|
||||||
|
message.success(`成功上传 ${totalUploadedLines} 个文件(按行分割)`);
|
||||||
|
} catch (err) {
|
||||||
|
console.error('[useSliceUpload] Stream upload error:', err);
|
||||||
|
if (err.message === "Upload cancelled") {
|
||||||
|
message.info("上传已取消");
|
||||||
|
} else {
|
||||||
|
message.error("文件上传失败,请稍后重试");
|
||||||
|
}
|
||||||
|
removeTask({
|
||||||
|
...task,
|
||||||
|
isCancel: true,
|
||||||
|
...taskListRef.current.find((item) => item.key === task.key),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 注册流式上传事件监听
|
||||||
|
* 返回注销函数
|
||||||
|
*/
|
||||||
|
const registerStreamUploadListener = () => {
|
||||||
|
if (!enableStreamUpload) return () => {};
|
||||||
|
|
||||||
|
const streamUploadHandler = async (e: Event) => {
|
||||||
|
const customEvent = e as CustomEvent;
|
||||||
|
const { dataset, files, updateEvent, hasArchive, prefix } = customEvent.detail;
|
||||||
|
|
||||||
|
const controller = new AbortController();
|
||||||
|
const task: TaskItem = {
|
||||||
|
key: dataset.id,
|
||||||
|
title: `上传数据集: ${dataset.name} (按行分割)`,
|
||||||
|
percent: 0,
|
||||||
|
reqId: -1,
|
||||||
|
controller,
|
||||||
|
size: 0,
|
||||||
|
updateEvent,
|
||||||
|
hasArchive,
|
||||||
|
prefix,
|
||||||
|
};
|
||||||
|
|
||||||
|
taskListRef.current = [task, ...taskListRef.current];
|
||||||
|
setTaskList(taskListRef.current);
|
||||||
|
|
||||||
|
// 显示任务中心
|
||||||
|
if (showTaskCenter) {
|
||||||
|
window.dispatchEvent(
|
||||||
|
new CustomEvent("show:task-popover", { detail: { show: true } })
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
await handleStreamUpload({ task, files });
|
||||||
|
};
|
||||||
|
|
||||||
|
window.addEventListener("upload:dataset-stream", streamUploadHandler);
|
||||||
|
|
||||||
|
return () => {
|
||||||
|
window.removeEventListener("upload:dataset-stream", streamUploadHandler);
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
return {
|
return {
|
||||||
taskList,
|
taskList,
|
||||||
createTask,
|
createTask,
|
||||||
removeTask,
|
removeTask,
|
||||||
handleUpload,
|
handleUpload,
|
||||||
|
handleStreamUpload,
|
||||||
|
registerStreamUploadListener,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,14 +4,9 @@ import { ArrowLeft } from "lucide-react";
 import { Button, Form, App } from "antd";
 import { Link, useLocation, useNavigate } from "react-router";
 import { createDatasetUsingPost } from "../dataset.api";
-import { datasetTypes } from "../dataset.const";
 import { DatasetType } from "../dataset.model";
 import BasicInformation from "./components/BasicInformation";
 
-const textDatasetTypeOptions = datasetTypes.filter(
-  (type) => type.value === DatasetType.TEXT
-);
-
 export default function DatasetCreate() {
   const navigate = useNavigate();
   const location = useLocation();
@@ -87,7 +82,6 @@ export default function DatasetCreate() {
           data={newDataset}
           setData={setNewDataset}
           hidden={["dataSource"]}
-          datasetTypeOptions={textDatasetTypeOptions}
         />
       </Form>
     </div>

@@ -5,7 +5,7 @@ import { Dataset, DatasetType, DataSource } from "../../dataset.model";
 import { useCallback, useEffect, useMemo, useState } from "react";
 import { queryTasksUsingGet } from "@/pages/DataCollection/collection.apis";
 import { updateDatasetByIdUsingPut } from "../../dataset.api";
-import { sliceFile } from "@/utils/file.util";
+import { sliceFile, shouldStreamUpload } from "@/utils/file.util";
 import Dragger from "antd/es/upload/Dragger";
 
 const TEXT_FILE_MIME_PREFIX = "text/";
@@ -90,14 +90,16 @@ async function splitFileByLines(file: UploadFile): Promise<UploadFile[]> {
   const lines = text.split(/\r?\n/).filter((line: string) => line.trim() !== "");
   if (lines.length === 0) return [];
 
-  // 生成文件名:原文件名_序号.扩展名
+  // 生成文件名:原文件名_序号(不保留后缀)
   const nameParts = file.name.split(".");
-  const ext = nameParts.length > 1 ? "." + nameParts.pop() : "";
+  if (nameParts.length > 1) {
+    nameParts.pop();
+  }
   const baseName = nameParts.join(".");
   const padLength = String(lines.length).length;
 
   return lines.map((line: string, index: number) => {
-    const newFileName = `${baseName}_${String(index + 1).padStart(padLength, "0")}${ext}`;
+    const newFileName = `${baseName}_${String(index + 1).padStart(padLength, "0")}`;
     const blob = new Blob([line], { type: "text/plain" });
     const newFile = new File([blob], newFileName, { type: "text/plain" });
     return {
@@ -164,17 +166,75 @@ export default function ImportConfiguration({
   // 本地上传文件相关逻辑

   const handleUpload = async (dataset: Dataset) => {
-    let filesToUpload =
+    const filesToUpload =
       (form.getFieldValue("files") as UploadFile[] | undefined) || [];
 
-    // 如果启用分行分割,处理文件
+    // 如果启用分行分割,对大文件使用流式处理
     if (importConfig.splitByLine && !hasNonTextFile) {
-      const splitResults = await Promise.all(
-        filesToUpload.map((file) => splitFileByLines(file))
-      );
-      filesToUpload = splitResults.flat();
+      // 检查是否有大文件需要流式分割上传
+      const filesForStreamUpload: File[] = [];
+      const filesForNormalUpload: UploadFile[] = [];
+
+      for (const file of filesToUpload) {
+        const originFile = file.originFileObj ?? file;
+        if (originFile instanceof File && shouldStreamUpload(originFile)) {
+          filesForStreamUpload.push(originFile);
+        } else {
+          filesForNormalUpload.push(file);
+        }
       }
+
+      // 大文件使用流式分割上传
+      if (filesForStreamUpload.length > 0) {
+        window.dispatchEvent(
+          new CustomEvent("upload:dataset-stream", {
+            detail: {
+              dataset,
+              files: filesForStreamUpload,
+              updateEvent,
+              hasArchive: importConfig.hasArchive,
+              prefix: currentPrefix,
+            },
+          })
+        );
+      }
+
+      // 小文件使用传统分割方式
+      if (filesForNormalUpload.length > 0) {
+        const splitResults = await Promise.all(
+          filesForNormalUpload.map((file) => splitFileByLines(file))
+        );
+        const smallFilesToUpload = splitResults.flat();
+
+        // 计算分片列表
+        const sliceList = smallFilesToUpload.map((file) => {
+          const originFile = (file.originFileObj ?? file) as Blob;
+          const slices = sliceFile(originFile);
+          return {
+            originFile: originFile,
+            slices,
+            name: file.name,
+            size: originFile.size || 0,
+          };
+        });
+
+        console.log("[ImportConfiguration] Uploading small files with currentPrefix:", currentPrefix);
+        window.dispatchEvent(
+          new CustomEvent("upload:dataset", {
+            detail: {
+              dataset,
+              files: sliceList,
+              updateEvent,
+              hasArchive: importConfig.hasArchive,
+              prefix: currentPrefix,
+            },
+          })
+        );
+      }
+      return;
+    }
 
+    // 未启用分行分割,使用普通上传
     // 计算分片列表
     const sliceList = filesToUpload.map((file) => {
       const originFile = (file.originFileObj ?? file) as Blob;
@@ -234,6 +294,10 @@ export default function ImportConfiguration({
     if (!data) return;
     console.log('[ImportConfiguration] handleImportData called, currentPrefix:', currentPrefix);
     if (importConfig.source === DataSource.UPLOAD) {
+      // 立即显示任务中心,让用户感知上传已开始(在文件分割等耗时操作之前)
+      window.dispatchEvent(
+        new CustomEvent("show:task-popover", { detail: { show: true } })
+      );
       await handleUpload(data);
     } else if (importConfig.source === DataSource.COLLECTION) {
       await updateDatasetByIdUsingPut(data.id, {

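The stream path above hands large files to streamSplitAndUpload (defined in the file.util.ts diff further down), which reads the file in fixed-size chunks and carries the trailing partial line over to the next chunk instead of loading the whole file into memory. A minimal sketch of that buffering idea, with a simplified per-line callback standing in for the real upload plumbing (names here are illustrative):

```typescript
// Sketch of chunked line splitting: read a File slice by slice, split on \r?\n,
// and keep the last (possibly incomplete) line in a buffer for the next chunk.
async function forEachLineInFile(
  file: File,
  onLine: (line: string, index: number) => Promise<void> | void,
  chunkSize = 1024 * 1024 // 1MB per read, as in the stream upload hook
): Promise<number> {
  let offset = 0;
  let buffer = "";
  let index = 0;

  while (offset < file.size) {
    const end = Math.min(offset + chunkSize, file.size);
    const text = await file.slice(offset, end).text();

    const lines = (buffer + text).split(/\r?\n/);
    buffer = lines.pop() ?? ""; // may be cut mid-line; keep it for the next chunk

    for (const line of lines) {
      if (line.trim() !== "") {
        await onLine(line, index++);
      }
    }
    offset = end;
  }

  // The file may not end with a newline; flush whatever is left in the buffer.
  if (buffer.trim() !== "") {
    await onLine(buffer, index++);
  }
  return index; // number of non-empty lines seen
}
```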
@@ -102,6 +102,13 @@ export interface DatasetTask {
   executionHistory?: { time: string; status: string }[];
 }
 
+export interface StreamUploadInfo {
+  currentFile: string;
+  fileIndex: number;
+  totalFiles: number;
+  uploadedLines: number;
+}
+
 export interface TaskItem {
   key: string;
   title: string;
@@ -113,4 +120,6 @@ export interface TaskItem {
   updateEvent?: string;
   size?: number;
   hasArchive?: boolean;
+  prefix?: string;
+  streamUploadInfo?: StreamUploadInfo;
 }

@@ -3,25 +3,28 @@ import {
   preUploadUsingPost,
   uploadFileChunkUsingPost,
 } from "@/pages/DataManagement/dataset.api";
-import { Button, Empty, Progress } from "antd";
-import { DeleteOutlined } from "@ant-design/icons";
+import { Button, Empty, Progress, Tag } from "antd";
+import { DeleteOutlined, FileTextOutlined } from "@ant-design/icons";
 import { useEffect } from "react";
 import { useFileSliceUpload } from "@/hooks/useSliceUpload";
 
 export default function TaskUpload() {
-  const { createTask, taskList, removeTask, handleUpload } = useFileSliceUpload(
+  const { createTask, taskList, removeTask, handleUpload, registerStreamUploadListener } = useFileSliceUpload(
     {
       preUpload: preUploadUsingPost,
       uploadChunk: uploadFileChunkUsingPost,
       cancelUpload: cancelUploadUsingPut,
-    }
+    },
+    true, // showTaskCenter
+    true // enableStreamUpload
   );
 
   useEffect(() => {
-    const uploadHandler = (e: any) => {
-      console.log('[TaskUpload] Received upload event detail:', e.detail);
-      const { files } = e.detail;
-      const task = createTask(e.detail);
+    const uploadHandler = (e: Event) => {
+      const customEvent = e as CustomEvent;
+      console.log('[TaskUpload] Received upload event detail:', customEvent.detail);
+      const { files } = customEvent.detail;
+      const task = createTask(customEvent.detail);
       console.log('[TaskUpload] Created task with prefix:', task.prefix);
       handleUpload({ task, files });
     };
@@ -29,7 +32,13 @@ export default function TaskUpload() {
     return () => {
       window.removeEventListener("upload:dataset", uploadHandler);
     };
-  }, []);
+  }, [createTask, handleUpload]);
+
+  // 注册流式上传监听器
+  useEffect(() => {
+    const unregister = registerStreamUploadListener();
+    return unregister;
+  }, [registerStreamUploadListener]);
 
   return (
     <div
@@ -55,7 +64,22 @@ export default function TaskUpload() {
             ></Button>
           </div>
 
-          <Progress size="small" percent={task.percent} />
+          <Progress size="small" percent={Number(task.percent)} />
+          {task.streamUploadInfo && (
+            <div className="flex items-center gap-2 text-xs text-gray-500 mt-1">
+              <Tag icon={<FileTextOutlined />} size="small">
+                按行分割
+              </Tag>
+              <span>
+                已上传: {task.streamUploadInfo.uploadedLines} 行
+              </span>
+              {task.streamUploadInfo.totalFiles > 1 && (
+                <span>
+                  ({task.streamUploadInfo.fileIndex}/{task.streamUploadInfo.totalFiles} 文件)
+                </span>
+              )}
+            </div>
+          )}
         </div>
       ))}
       {taskList.length === 0 && (

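TaskUpload registers the stream listener through registerStreamUploadListener, so any component can hand large text files to the hook by dispatching the "upload:dataset-stream" CustomEvent that ImportConfiguration fires. A small sketch of raising that event with the detail shape the listener destructures (the dataset object, update event name, and prefix below are placeholders):

```typescript
// Sketch: trigger the line-splitting stream upload that TaskUpload listens for.
function startStreamUpload(files: File[]) {
  window.dispatchEvent(
    new CustomEvent("upload:dataset-stream", {
      detail: {
        dataset: { id: "dataset-id", name: "example-dataset" }, // placeholder dataset
        files,                                 // large text files to split by line
        updateEvent: "refresh:dataset-files",  // placeholder refresh event name
        hasArchive: false,
        prefix: "",                            // keep uploads in the current directory
      },
    })
  );
}
```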
@@ -1,79 +1,600 @@
|
|||||||
import { UploadFile } from "antd";
|
import { UploadFile } from "antd";
|
||||||
import jsSHA from "jssha";
|
import jsSHA from "jssha";
|
||||||
|
|
||||||
const CHUNK_SIZE = 1024 * 1024 * 60;
|
// 默认分片大小:5MB(适合大多数网络环境)
|
||||||
|
export const DEFAULT_CHUNK_SIZE = 1024 * 1024 * 5;
|
||||||
|
// 大文件阈值:10MB
|
||||||
|
export const LARGE_FILE_THRESHOLD = 1024 * 1024 * 10;
|
||||||
|
// 最大并发上传数
|
||||||
|
export const MAX_CONCURRENT_UPLOADS = 3;
|
||||||
|
// 文本文件读取块大小:20MB(用于计算 SHA256)
|
||||||
|
const BUFFER_CHUNK_SIZE = 1024 * 1024 * 20;
|
||||||
|
|
||||||
export function sliceFile(file, chunkSize = CHUNK_SIZE): Blob[] {
|
/**
|
||||||
|
* 将文件分割为多个分片
|
||||||
|
* @param file 文件对象
|
||||||
|
* @param chunkSize 分片大小(字节),默认 5MB
|
||||||
|
* @returns 分片数组(Blob 列表)
|
||||||
|
*/
|
||||||
|
export function sliceFile(file: Blob, chunkSize = DEFAULT_CHUNK_SIZE): Blob[] {
|
||||||
const totalSize = file.size;
|
const totalSize = file.size;
|
||||||
|
const chunks: Blob[] = [];
|
||||||
|
|
||||||
|
// 小文件不需要分片
|
||||||
|
if (totalSize <= chunkSize) {
|
||||||
|
return [file];
|
||||||
|
}
|
||||||
|
|
||||||
let start = 0;
|
let start = 0;
|
||||||
let end = start + chunkSize;
|
|
||||||
const chunks = [];
|
|
||||||
while (start < totalSize) {
|
while (start < totalSize) {
|
||||||
|
const end = Math.min(start + chunkSize, totalSize);
|
||||||
const blob = file.slice(start, end);
|
const blob = file.slice(start, end);
|
||||||
chunks.push(blob);
|
chunks.push(blob);
|
||||||
|
|
||||||
start = end;
|
start = end;
|
||||||
end = start + chunkSize;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return chunks;
|
return chunks;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function calculateSHA256(file: Blob): Promise<string> {
|
/**
|
||||||
let count = 0;
|
* 计算文件的 SHA256 哈希值
|
||||||
const hash = new jsSHA("SHA-256", "ARRAYBUFFER", { encoding: "UTF8" });
|
* @param file 文件 Blob
|
||||||
|
* @param onProgress 进度回调(可选)
|
||||||
|
* @returns SHA256 哈希字符串
|
||||||
|
*/
|
||||||
|
export function calculateSHA256(
|
||||||
|
file: Blob,
|
||||||
|
onProgress?: (percent: number) => void
|
||||||
|
): Promise<string> {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
|
const hash = new jsSHA("SHA-256", "ARRAYBUFFER", { encoding: "UTF8" });
|
||||||
const reader = new FileReader();
|
const reader = new FileReader();
|
||||||
|
let processedSize = 0;
|
||||||
|
|
||||||
function readChunk(start: number, end: number) {
|
function readChunk(start: number, end: number) {
|
||||||
const slice = file.slice(start, end);
|
const slice = file.slice(start, end);
|
||||||
reader.readAsArrayBuffer(slice);
|
reader.readAsArrayBuffer(slice);
|
||||||
}
|
}
|
||||||
|
|
||||||
const bufferChunkSize = 1024 * 1024 * 20;
|
|
||||||
|
|
||||||
function processChunk(offset: number) {
|
function processChunk(offset: number) {
|
||||||
const start = offset;
|
const start = offset;
|
||||||
const end = Math.min(start + bufferChunkSize, file.size);
|
const end = Math.min(start + BUFFER_CHUNK_SIZE, file.size);
|
||||||
count = end;
|
|
||||||
|
|
||||||
readChunk(start, end);
|
readChunk(start, end);
|
||||||
}
|
}
|
||||||
|
|
||||||
reader.onloadend = function () {
|
reader.onloadend = function (e) {
|
||||||
const arraybuffer = reader.result;
|
const arraybuffer = reader.result as ArrayBuffer;
|
||||||
|
if (!arraybuffer) {
|
||||||
|
reject(new Error("Failed to read file"));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
hash.update(arraybuffer);
|
hash.update(arraybuffer);
|
||||||
if (count < file.size) {
|
processedSize += (e.target as FileReader).result?.byteLength || 0;
|
||||||
processChunk(count);
|
|
||||||
|
if (onProgress) {
|
||||||
|
const percent = Math.min(100, Math.round((processedSize / file.size) * 100));
|
||||||
|
onProgress(percent);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (processedSize < file.size) {
|
||||||
|
processChunk(processedSize);
|
||||||
} else {
|
} else {
|
||||||
resolve(hash.getHash("HEX", { outputLen: 256 }));
|
resolve(hash.getHash("HEX", { outputLen: 256 }));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
reader.onerror = () => reject(new Error("File reading failed"));
|
||||||
processChunk(0);
|
processChunk(0);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 批量计算多个文件的 SHA256
|
||||||
|
* @param files 文件列表
|
||||||
|
* @param onFileProgress 单个文件进度回调(可选)
|
||||||
|
* @returns 哈希值数组
|
||||||
|
*/
|
||||||
|
export async function calculateSHA256Batch(
|
||||||
|
files: Blob[],
|
||||||
|
onFileProgress?: (index: number, percent: number) => void
|
||||||
|
): Promise<string[]> {
|
||||||
|
const results: string[] = [];
|
||||||
|
|
||||||
|
for (let i = 0; i < files.length; i++) {
|
||||||
|
const hash = await calculateSHA256(files[i], (percent) => {
|
||||||
|
onFileProgress?.(i, percent);
|
||||||
|
});
|
||||||
|
results.push(hash);
|
||||||
|
}
|
||||||
|
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 检查文件是否存在(未被修改或删除)
|
||||||
|
* @param fileList 文件列表
|
||||||
|
* @returns 返回第一个不存在的文件,或 null(如果都存在)
|
||||||
|
*/
|
||||||
export function checkIsFilesExist(
|
export function checkIsFilesExist(
|
||||||
fileList: UploadFile[]
|
fileList: Array<{ originFile?: Blob }>
|
||||||
): Promise<UploadFile | null> {
|
): Promise<{ originFile?: Blob } | null> {
|
||||||
return new Promise((resolve) => {
|
return new Promise((resolve) => {
|
||||||
const loadEndFn = (file: UploadFile, reachEnd: boolean, e) => {
|
if (!fileList.length) {
|
||||||
const fileNotExist = !e.target.result;
|
resolve(null);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let checkedCount = 0;
|
||||||
|
const totalCount = fileList.length;
|
||||||
|
|
||||||
|
const loadEndFn = (file: { originFile?: Blob }, e: ProgressEvent<FileReader>) => {
|
||||||
|
checkedCount++;
|
||||||
|
const fileNotExist = !e.target?.result;
|
||||||
if (fileNotExist) {
|
if (fileNotExist) {
|
||||||
resolve(file);
|
resolve(file);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
if (reachEnd) {
|
if (checkedCount >= totalCount) {
|
||||||
resolve(null);
|
resolve(null);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
for (let i = 0; i < fileList.length; i++) {
|
for (const file of fileList) {
|
||||||
const { originFile: file } = fileList[i];
|
|
||||||
const fileReader = new FileReader();
|
const fileReader = new FileReader();
|
||||||
fileReader.readAsArrayBuffer(file);
|
const actualFile = file.originFile;
|
||||||
fileReader.onloadend = (e) =>
|
|
||||||
loadEndFn(fileList[i], i === fileList.length - 1, e);
|
if (!actualFile) {
|
||||||
|
checkedCount++;
|
||||||
|
if (checkedCount >= totalCount) {
|
||||||
|
resolve(null);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
fileReader.readAsArrayBuffer(actualFile.slice(0, 1));
|
||||||
|
fileReader.onloadend = (e) => loadEndFn(file, e);
|
||||||
|
fileReader.onerror = () => {
|
||||||
|
checkedCount++;
|
||||||
|
resolve(file);
|
||||||
|
};
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 判断文件是否为大文件
|
||||||
|
* @param size 文件大小(字节)
|
||||||
|
* @param threshold 阈值(字节),默认 10MB
|
||||||
|
*/
|
||||||
|
export function isLargeFile(size: number, threshold = LARGE_FILE_THRESHOLD): boolean {
|
||||||
|
return size > threshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 格式化文件大小为人类可读格式
|
||||||
|
* @param bytes 字节数
|
||||||
|
* @param decimals 小数位数
|
||||||
|
*/
|
||||||
|
export function formatFileSize(bytes: number, decimals = 2): string {
|
||||||
|
if (bytes === 0) return "0 B";
|
||||||
|
|
||||||
|
const k = 1024;
|
||||||
|
const sizes = ["B", "KB", "MB", "GB", "TB", "PB"];
|
||||||
|
const i = Math.floor(Math.log(bytes) / Math.log(k));
|
||||||
|
|
||||||
|
return `${parseFloat((bytes / Math.pow(k, i)).toFixed(decimals))} ${sizes[i]}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 并发执行异步任务
|
||||||
|
* @param tasks 任务函数数组
|
||||||
|
* @param maxConcurrency 最大并发数
|
||||||
|
* @param onTaskComplete 单个任务完成回调(可选)
|
||||||
|
*/
|
||||||
|
export async function runConcurrentTasks<T>(
|
||||||
|
tasks: (() => Promise<T>)[],
|
||||||
|
maxConcurrency: number,
|
||||||
|
onTaskComplete?: (index: number, result: T) => void
|
||||||
|
): Promise<T[]> {
|
||||||
|
const results: T[] = new Array(tasks.length);
|
||||||
|
let index = 0;
|
||||||
|
|
||||||
|
async function runNext(): Promise<void> {
|
||||||
|
const currentIndex = index++;
|
||||||
|
if (currentIndex >= tasks.length) return;
|
||||||
|
|
||||||
|
const result = await tasks[currentIndex]();
|
||||||
|
results[currentIndex] = result;
|
||||||
|
onTaskComplete?.(currentIndex, result);
|
||||||
|
|
||||||
|
await runNext();
|
||||||
|
}
|
||||||
|
|
||||||
|
const workers = Array(Math.min(maxConcurrency, tasks.length))
|
||||||
|
.fill(null)
|
||||||
|
.map(() => runNext());
|
||||||
|
|
||||||
|
await Promise.all(workers);
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 按行分割文本文件内容
|
||||||
|
* @param text 文本内容
|
||||||
|
* @param skipEmptyLines 是否跳过空行,默认 true
|
||||||
|
* @returns 行数组
|
||||||
|
*/
|
||||||
|
export function splitTextByLines(text: string, skipEmptyLines = true): string[] {
|
||||||
|
const lines = text.split(/\r?\n/);
|
||||||
|
if (skipEmptyLines) {
|
||||||
|
return lines.filter((line) => line.trim() !== "");
|
||||||
|
}
|
||||||
|
return lines;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 创建分片信息对象
|
||||||
|
* @param file 原始文件
|
||||||
|
* @param chunkSize 分片大小
|
||||||
|
*/
|
||||||
|
export function createFileSliceInfo(
|
||||||
|
file: File | Blob,
|
||||||
|
chunkSize = DEFAULT_CHUNK_SIZE
|
||||||
|
): {
|
||||||
|
originFile: Blob;
|
||||||
|
slices: Blob[];
|
||||||
|
name: string;
|
||||||
|
size: number;
|
||||||
|
totalChunks: number;
|
||||||
|
} {
|
||||||
|
const slices = sliceFile(file, chunkSize);
|
||||||
|
return {
|
||||||
|
originFile: file,
|
||||||
|
slices,
|
||||||
|
name: (file as File).name || "unnamed",
|
||||||
|
size: file.size,
|
||||||
|
totalChunks: slices.length,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 支持的文本文件 MIME 类型前缀
|
||||||
|
*/
|
||||||
|
export const TEXT_FILE_MIME_PREFIX = "text/";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 支持的文本文件 MIME 类型集合
|
||||||
|
*/
|
||||||
|
export const TEXT_FILE_MIME_TYPES = new Set([
|
||||||
|
"application/json",
|
||||||
|
"application/xml",
|
||||||
|
"application/csv",
|
||||||
|
"application/ndjson",
|
||||||
|
"application/x-ndjson",
|
||||||
|
"application/x-yaml",
|
||||||
|
"application/yaml",
|
||||||
|
"application/javascript",
|
||||||
|
"application/x-javascript",
|
||||||
|
"application/sql",
|
||||||
|
"application/rtf",
|
||||||
|
"application/xhtml+xml",
|
||||||
|
"application/svg+xml",
|
||||||
|
]);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 支持的文本文件扩展名集合
|
||||||
|
*/
|
||||||
|
export const TEXT_FILE_EXTENSIONS = new Set([
|
||||||
|
".txt",
|
||||||
|
".md",
|
||||||
|
".markdown",
|
||||||
|
".csv",
|
||||||
|
".tsv",
|
||||||
|
".json",
|
||||||
|
".jsonl",
|
||||||
|
".ndjson",
|
||||||
|
".log",
|
||||||
|
".xml",
|
||||||
|
".yaml",
|
||||||
|
".yml",
|
||||||
|
".sql",
|
||||||
|
".js",
|
||||||
|
".ts",
|
||||||
|
".jsx",
|
||||||
|
".tsx",
|
||||||
|
".html",
|
||||||
|
".htm",
|
||||||
|
".css",
|
||||||
|
".scss",
|
||||||
|
".less",
|
||||||
|
".py",
|
||||||
|
".java",
|
||||||
|
".c",
|
||||||
|
".cpp",
|
||||||
|
".h",
|
||||||
|
".hpp",
|
||||||
|
".go",
|
||||||
|
".rs",
|
||||||
|
".rb",
|
||||||
|
".php",
|
||||||
|
".sh",
|
||||||
|
".bash",
|
||||||
|
".zsh",
|
||||||
|
".ps1",
|
||||||
|
".bat",
|
||||||
|
".cmd",
|
||||||
|
".svg",
|
||||||
|
".rtf",
|
||||||
|
]);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 判断文件是否为文本文件(支持 UploadFile 类型)
|
||||||
|
* @param file UploadFile 对象
|
||||||
|
*/
|
||||||
|
export function isTextUploadFile(file: UploadFile): boolean {
|
||||||
|
const mimeType = (file.type || "").toLowerCase();
|
||||||
|
if (mimeType) {
|
||||||
|
if (mimeType.startsWith(TEXT_FILE_MIME_PREFIX)) return true;
|
||||||
|
if (TEXT_FILE_MIME_TYPES.has(mimeType)) return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
const fileName = file.name || "";
|
||||||
|
const dotIndex = fileName.lastIndexOf(".");
|
||||||
|
if (dotIndex < 0) return false;
|
||||||
|
const ext = fileName.slice(dotIndex).toLowerCase();
|
||||||
|
return TEXT_FILE_EXTENSIONS.has(ext);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 判断文件名是否为文本文件
|
||||||
|
* @param fileName 文件名
|
||||||
|
*/
|
||||||
|
export function isTextFileByName(fileName: string): boolean {
|
||||||
|
const lowerName = fileName.toLowerCase();
|
||||||
|
|
||||||
|
// 先检查 MIME 类型(如果有)
|
||||||
|
// 这里简化处理,主要通过扩展名判断
|
||||||
|
|
||||||
|
const dotIndex = lowerName.lastIndexOf(".");
|
||||||
|
if (dotIndex < 0) return false;
|
||||||
|
const ext = lowerName.slice(dotIndex);
|
||||||
|
return TEXT_FILE_EXTENSIONS.has(ext);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取文件扩展名
|
||||||
|
* @param fileName 文件名
|
||||||
|
*/
|
||||||
|
export function getFileExtension(fileName: string): string {
|
||||||
|
const dotIndex = fileName.lastIndexOf(".");
|
||||||
|
if (dotIndex < 0) return "";
|
||||||
|
return fileName.slice(dotIndex).toLowerCase();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 安全地读取文件为文本
|
||||||
|
* @param file 文件对象
|
||||||
|
* @param encoding 编码,默认 UTF-8
|
||||||
|
*/
|
||||||
|
export function readFileAsText(
|
||||||
|
file: File | Blob,
|
||||||
|
encoding = "UTF-8"
|
||||||
|
): Promise<string> {
|
||||||
|
return new Promise((resolve, reject) => {
|
||||||
|
const reader = new FileReader();
|
||||||
|
reader.onload = (e) => resolve(e.target?.result as string);
|
||||||
|
reader.onerror = () => reject(new Error("Failed to read file"));
|
||||||
|
reader.readAsText(file, encoding);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 流式分割文件并逐行上传
|
||||||
|
* 使用 Blob.slice 逐块读取,避免一次性加载大文件到内存
|
||||||
|
* @param file 文件对象
|
||||||
|
* @param datasetId 数据集ID
|
||||||
|
* @param uploadFn 上传函数,接收 FormData 和配置,返回 Promise
|
||||||
|
* @param onProgress 进度回调 (currentBytes, totalBytes, uploadedLines)
|
||||||
|
* @param chunkSize 每次读取的块大小,默认 1MB
|
||||||
|
* @param options 其他选项
|
||||||
|
* @returns 上传结果统计
|
||||||
|
*/
|
||||||
|
export interface StreamUploadOptions {
|
||||||
|
reqId: number;
|
||||||
|
fileNamePrefix?: string;
|
||||||
|
hasArchive?: boolean;
|
||||||
|
prefix?: string;
|
||||||
|
signal?: AbortSignal;
|
||||||
|
maxConcurrency?: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface StreamUploadResult {
|
||||||
|
uploadedCount: number;
|
||||||
|
totalBytes: number;
|
||||||
|
skippedEmptyCount: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function streamSplitAndUpload(
|
||||||
|
file: File,
|
||||||
|
uploadFn: (formData: FormData, config?: { onUploadProgress?: (e: { loaded: number; total: number }) => void }) => Promise<unknown>,
|
||||||
|
onProgress?: (currentBytes: number, totalBytes: number, uploadedLines: number) => void,
|
||||||
|
chunkSize: number = 1024 * 1024, // 1MB
|
||||||
|
options: StreamUploadOptions
|
||||||
|
): Promise<StreamUploadResult> {
|
||||||
|
const { reqId, fileNamePrefix, prefix, signal, maxConcurrency = 3 } = options;
|
||||||
|
|
||||||
|
const fileSize = file.size;
|
||||||
|
let offset = 0;
|
||||||
|
let buffer = "";
|
||||||
|
let uploadedCount = 0;
|
||||||
|
let skippedEmptyCount = 0;
|
||||||
|
let currentBytes = 0;
|
||||||
|
|
||||||
|
// 获取文件名基础部分和扩展名
|
||||||
|
const originalFileName = fileNamePrefix || file.name;
|
||||||
|
const lastDotIndex = originalFileName.lastIndexOf(".");
|
||||||
|
const baseName = lastDotIndex > 0 ? originalFileName.slice(0, lastDotIndex) : originalFileName;
|
||||||
|
const fileExtension = lastDotIndex > 0 ? originalFileName.slice(lastDotIndex) : "";
|
||||||
|
|
||||||
|
// 收集所有需要上传的行
|
||||||
|
const pendingLines: { line: string; index: number }[] = [];
|
||||||
|
let lineIndex = 0;
|
||||||
|
|
||||||
|
// 逐块读取文件并收集行
|
||||||
|
while (offset < fileSize) {
|
||||||
|
// 检查是否已取消
|
||||||
|
if (signal?.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
|
||||||
|
const end = Math.min(offset + chunkSize, fileSize);
|
||||||
|
const chunk = file.slice(offset, end);
|
||||||
|
const text = await readFileAsText(chunk);
|
||||||
|
|
||||||
|
// 将新读取的内容追加到 buffer
|
||||||
|
const combined = buffer + text;
|
||||||
|
|
||||||
|
// 按换行符分割(支持 \n 和 \r\n)
|
||||||
|
const lines = combined.split(/\r?\n/);
|
||||||
|
|
||||||
|
// 保留最后一行(可能不完整)
|
||||||
|
buffer = lines.pop() || "";
|
||||||
|
|
||||||
|
// 收集完整行
|
||||||
|
for (const line of lines) {
|
||||||
|
if (signal?.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
pendingLines.push({ line, index: lineIndex++ });
|
||||||
|
}
|
||||||
|
|
||||||
|
currentBytes = end;
|
||||||
|
offset = end;
|
||||||
|
|
||||||
|
// 每处理完一个 chunk,更新进度
|
||||||
|
onProgress?.(currentBytes, fileSize, uploadedCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 处理最后剩余的 buffer(如果文件不以换行符结尾)
|
||||||
|
if (buffer.trim()) {
|
||||||
|
pendingLines.push({ line: buffer, index: lineIndex++ });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 上传单行内容
|
||||||
|
* fileNo 固定为 1(因为所有行都属于同一个原始文件,只是不同的分片/行)
|
||||||
|
* chunkNo 用于标识是第几行
|
||||||
|
*/
|
||||||
|
async function uploadLine(line: string, index: number): Promise<void> {
|
||||||
|
// 检查是否已取消
|
||||||
|
if (signal?.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!line.trim()) {
|
||||||
|
skippedEmptyCount++;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 保留原始文件扩展名
|
||||||
|
const newFileName = `${baseName}_${String(index + 1).padStart(6, "0")}${fileExtension}`;
|
||||||
|
const blob = new Blob([line], { type: "text/plain" });
|
||||||
|
const lineFile = new File([blob], newFileName, { type: "text/plain" });
|
||||||
|
|
||||||
|
// 计算分片(小文件通常只需要一个分片)
|
||||||
|
const slices = sliceFile(lineFile, DEFAULT_CHUNK_SIZE);
|
||||||
|
const checkSum = await calculateSHA256(slices[0]);
|
||||||
|
|
||||||
|
// 检查是否已取消(计算哈希后)
|
||||||
|
if (signal?.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
|
||||||
|
const formData = new FormData();
|
||||||
|
formData.append("file", slices[0]);
|
||||||
|
formData.append("reqId", reqId.toString());
|
||||||
|
// 所有行使用相同的 fileNo=1,因为它们属于同一个预上传请求
|
||||||
|
// chunkNo 表示这是第几行数据
|
||||||
|
formData.append("fileNo", "1");
|
||||||
|
formData.append("chunkNo", (index + 1).toString());
|
||||||
|
formData.append("fileName", newFileName);
|
||||||
|
formData.append("fileSize", lineFile.size.toString());
|
||||||
|
formData.append("totalChunkNum", "1");
|
||||||
|
formData.append("checkSumHex", checkSum);
|
||||||
|
if (prefix !== undefined) {
|
||||||
|
formData.append("prefix", prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
await uploadFn(formData, {
|
||||||
|
onUploadProgress: () => {
|
||||||
|
// 单行文件很小,进度主要用于追踪上传状态
|
||||||
|
},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 带并发控制的上传队列执行器
|
||||||
|
* 使用任务队列模式,确保不会同时启动所有上传任务
|
||||||
|
*/
|
||||||
|
async function executeUploadsWithConcurrency(): Promise<void> {
|
||||||
|
const lines = [...pendingLines];
|
||||||
|
let currentIndex = 0;
|
||||||
|
let activeCount = 0;
|
||||||
|
let resolvedCount = 0;
|
||||||
|
|
||||||
|
return new Promise((resolve, reject) => {
|
||||||
|
function tryStartNext() {
|
||||||
|
// 检查是否已完成
|
||||||
|
if (resolvedCount >= lines.length) {
|
||||||
|
if (activeCount === 0) {
|
||||||
|
resolve();
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 启动新的上传任务,直到达到最大并发数
|
||||||
|
while (activeCount < maxConcurrency && currentIndex < lines.length) {
|
||||||
|
const { line, index } = lines[currentIndex++];
|
||||||
|
activeCount++;
|
||||||
|
|
||||||
|
uploadLine(line, index)
|
||||||
|
.then(() => {
|
||||||
|
uploadedCount++;
|
||||||
|
onProgress?.(fileSize, fileSize, uploadedCount);
|
||||||
|
})
|
||||||
|
.catch((err) => {
|
||||||
|
reject(err);
|
||||||
|
})
|
||||||
|
.finally(() => {
|
||||||
|
activeCount--;
|
||||||
|
resolvedCount++;
|
||||||
|
// 尝试启动下一个任务
|
||||||
|
tryStartNext();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 开始执行
|
||||||
|
tryStartNext();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// 使用并发控制执行所有上传
|
||||||
|
await executeUploadsWithConcurrency();
|
||||||
|
|
||||||
|
return {
|
||||||
|
uploadedCount,
|
||||||
|
totalBytes: fileSize,
|
||||||
|
skippedEmptyCount,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 判断文件是否需要流式分割上传
|
||||||
|
* @param file 文件对象
|
||||||
|
* @param threshold 阈值,默认 5MB
|
||||||
|
*/
|
||||||
|
export function shouldStreamUpload(file: File, threshold: number = 5 * 1024 * 1024): boolean {
|
||||||
|
return file.size > threshold;
|
||||||
|
}
|
||||||
|
|||||||
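Both runConcurrentTasks and the executeUploadsWithConcurrency queue in the file.util.ts diff above cap how many line uploads run at once by letting a fixed number of workers pull the next pending task from a shared index. A condensed sketch of that worker-pool pattern (the names here are illustrative, not the project's API):

```typescript
// Sketch of the worker-pool concurrency used for per-line uploads:
// start at most `maxConcurrency` workers, each pulling the next task until none remain.
async function runWithConcurrency<T>(
  tasks: Array<() => Promise<T>>,
  maxConcurrency = 3
): Promise<T[]> {
  const results: T[] = new Array(tasks.length);
  let next = 0;

  async function worker(): Promise<void> {
    while (true) {
      const current = next++;
      if (current >= tasks.length) return;
      results[current] = await tasks[current]();
    }
  }

  const workers = Array.from(
    { length: Math.min(maxConcurrency, tasks.length) },
    () => worker()
  );
  await Promise.all(workers);
  return results;
}

// Usage sketch: upload at most three lines at a time.
// const uploads = lines.map((line, i) => () => uploadLine(line, i));
// await runWithConcurrency(uploads, 3);
```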
@@ -92,6 +92,14 @@ class Request {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 监听 AbortSignal 来中止请求
|
||||||
|
if (config.signal) {
|
||||||
|
config.signal.addEventListener("abort", () => {
|
||||||
|
xhr.abort();
|
||||||
|
reject(new Error("上传已取消"));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// 监听上传进度
|
// 监听上传进度
|
||||||
xhr.upload.addEventListener("progress", function (event) {
|
xhr.upload.addEventListener("progress", function (event) {
|
||||||
if (event.lengthComputable) {
|
if (event.lengthComputable) {
|
||||||
|
@@ -150,6 +150,18 @@ async def create_mapping(
        labeling_project, snapshot_file_ids
    )

    # If segmentation is enabled and this is a text dataset, pre-generate the segment structure
    if dataset_type == TEXT_DATASET_TYPE and request.segmentation_enabled:
        try:
            from ..service.editor import AnnotationEditorService
            editor_service = AnnotationEditorService(db)
            # Precompute segmentation (awaited here; failures are handled below and do not block creation)
            segmentation_result = await editor_service.precompute_segmentation_for_project(labeling_project.id)
            logger.info(f"Precomputed segmentation for project {labeling_project.id}: {segmentation_result}")
        except Exception as e:
            logger.warning(f"Failed to precompute segmentation for project {labeling_project.id}: {e}")
            # Does not affect project creation; only a warning is logged

    response_data = DatasetMappingCreateResponse(
        id=mapping.id,
        labeling_project_id=str(mapping.labeling_project_id),
@@ -1185,3 +1185,195 @@ class AnnotationEditorService:
        except Exception as exc:
            logger.warning("标注同步知识管理失败:%s", exc)

    async def precompute_segmentation_for_project(
        self,
        project_id: str,
        max_retries: int = 3
    ) -> Dict[str, Any]:
        """
        Pre-compute the segment structure for every text file of a project and persist it to the database.

        Args:
            project_id: labeling project ID
            max_retries: number of retries on failure

        Returns:
            statistics: {total_files, succeeded, failed}
        """
        project = await self._get_project_or_404(project_id)
        dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))

        # Only text datasets are handled
        if dataset_type != DATASET_TYPE_TEXT:
            logger.info(f"项目 {project_id} 不是文本数据集,跳过切片预生成")
            return {"total_files": 0, "succeeded": 0, "failed": 0}

        # Check whether segmentation is enabled
        if not self._resolve_segmentation_enabled(project):
            logger.info(f"项目 {project_id} 未启用分段,跳过切片预生成")
            return {"total_files": 0, "succeeded": 0, "failed": 0}

        # Fetch all text files of the project (source documents are filtered out below)
        files_result = await self.db.execute(
            select(DatasetFiles)
            .join(LabelingProjectFile, LabelingProjectFile.file_id == DatasetFiles.id)
            .where(
                LabelingProjectFile.project_id == project_id,
                DatasetFiles.dataset_id == project.dataset_id,
            )
        )
        file_records = files_result.scalars().all()

        if not file_records:
            logger.info(f"项目 {project_id} 没有文件,跳过切片预生成")
            return {"total_files": 0, "succeeded": 0, "failed": 0}

        # Filter out source-document files
        valid_files = []
        for file_record in file_records:
            file_type = str(getattr(file_record, "file_type", "") or "").lower()
            file_name = str(getattr(file_record, "file_name", "")).lower()
            is_source_document = (
                file_type in SOURCE_DOCUMENT_TYPES or
                any(file_name.endswith(ext) for ext in SOURCE_DOCUMENT_EXTENSIONS)
            )
            if not is_source_document:
                valid_files.append(file_record)

        total_files = len(valid_files)
        succeeded = 0
        failed = 0

        label_config = await self._resolve_project_label_config(project)
        primary_text_key = self._resolve_primary_text_key(label_config)

        for file_record in valid_files:
            file_id = str(file_record.id)  # type: ignore
            file_name = str(getattr(file_record, "file_name", ""))

            for retry in range(max_retries):
                try:
                    # Read the text content
                    text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
                    if not isinstance(text_content, str):
                        logger.warning(f"文件 {file_id} 内容不是字符串,跳过切片")
                        failed += 1
                        break

                    # Parse the text records
                    records: List[Tuple[Optional[Dict[str, Any]], str]] = []
                    if file_name.lower().endswith(JSONL_EXTENSION):
                        records = self._parse_jsonl_records(text_content)
                    else:
                        parsed_payload = self._try_parse_json_payload(text_content)
                        if parsed_payload:
                            records = [(parsed_payload, text_content)]

                    if not records:
                        records = [(None, text_content)]

                    record_texts = [
                        self._resolve_primary_text_value(payload, raw_text, primary_text_key)
                        for payload, raw_text in records
                    ]
                    if not record_texts:
                        record_texts = [text_content]

                    # Decide whether segmentation is needed
                    needs_segmentation = len(records) > 1 or any(
                        len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts
                    )

                    if not needs_segmentation:
                        # Files that do not need segmentation are skipped
                        succeeded += 1
                        break

                    # Perform the segmentation
                    splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD)
                    segment_cursor = 0
                    segments = {}

                    for record_index, ((payload, raw_text), record_text) in enumerate(zip(records, record_texts)):
                        normalized_text = record_text or ""

                        if len(normalized_text) > self.SEGMENT_THRESHOLD:
                            # One empty slot per raw segment; only the segment structure is persisted here
                            raw_segments = splitter.split(normalized_text)
                            for chunk_index, seg in enumerate(raw_segments):
                                segments[str(segment_cursor)] = {
                                    SEGMENT_RESULT_KEY: [],
                                    SEGMENT_CREATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
                                    SEGMENT_UPDATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
                                }
                                segment_cursor += 1
                        else:
                            segments[str(segment_cursor)] = {
                                SEGMENT_RESULT_KEY: [],
                                SEGMENT_CREATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
                                SEGMENT_UPDATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
                            }
                            segment_cursor += 1

                    if not segments:
                        succeeded += 1
                        break

                    # Build the segmented-annotation structure
                    final_payload = {
                        SEGMENTED_KEY: True,
                        "version": 1,
                        SEGMENTS_KEY: segments,
                        SEGMENT_TOTAL_KEY: segment_cursor,
                    }

                    # Check whether an annotation already exists
                    existing_result = await self.db.execute(
                        select(AnnotationResult).where(
                            AnnotationResult.project_id == project_id,
                            AnnotationResult.file_id == file_id,
                        )
                    )
                    existing = existing_result.scalar_one_or_none()

                    now = datetime.utcnow()

                    if existing:
                        # Update the existing annotation
                        existing.annotation = final_payload  # type: ignore[assignment]
                        existing.annotation_status = ANNOTATION_STATUS_IN_PROGRESS  # type: ignore[assignment]
                        existing.updated_at = now  # type: ignore[assignment]
                    else:
                        # Create a new annotation record
                        record = AnnotationResult(
                            id=str(uuid.uuid4()),
                            project_id=project_id,
                            file_id=file_id,
                            annotation=final_payload,
                            annotation_status=ANNOTATION_STATUS_IN_PROGRESS,
                            created_at=now,
                            updated_at=now,
                        )
                        self.db.add(record)

                    await self.db.commit()
                    succeeded += 1
                    logger.info(f"成功为文件 {file_id} 预生成 {segment_cursor} 个切片")
                    break

                except Exception as e:
                    logger.warning(
                        f"为文件 {file_id} 预生成切片失败 (重试 {retry + 1}/{max_retries}): {e}"
                    )
                    if retry == max_retries - 1:
                        failed += 1
                    await self.db.rollback()

        logger.info(
            f"项目 {project_id} 切片预生成完成: 总计 {total_files}, 成功 {succeeded}, 失败 {failed}"
        )
        return {
            "total_files": total_files,
            "succeeded": succeeded,
            "failed": failed,
        }
scripts/offline/Dockerfile.backend-python.offline-v2 (new file, 82 lines)
@@ -0,0 +1,82 @@
# backend-python Dockerfile, offline variant v2
FROM maven:3-eclipse-temurin-8 AS datax-builder

# Configure the Aliyun Maven mirror
RUN mkdir -p /root/.m2 && \
    echo '<?xml version="1.0" encoding="UTF-8"?>\n\
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"\n\
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n\
          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">\n\
  <mirrors>\n\
    <mirror>\n\
      <id>aliyunmaven</id>\n\
      <mirrorOf>*</mirrorOf>\n\
      <name>阿里云公共仓库</name>\n\
      <url>https://maven.aliyun.com/repository/public</url>\n\
    </mirror>\n\
  </mirrors>\n\
</settings>' > /root/.m2/settings.xml

# Offline mode: take the local DataX path from build arguments
ARG RESOURCES_DIR=./build-cache/resources
ARG DATAX_LOCAL_PATH=${RESOURCES_DIR}/DataX

# Copy the local DataX sources
COPY ${DATAX_LOCAL_PATH} /DataX

COPY runtime/datax/ DataX/

RUN cd DataX && \
    sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
        plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java && \
    mvn -U clean package assembly:assembly -Dmaven.test.skip=true

# Use the base image with APT packages pre-installed
FROM datamate-python-base:latest

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    POETRY_VERSION=2.2.1 \
    POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_CREATE=false \
    POETRY_CACHE_DIR=/tmp/poetry_cache

ENV JAVA_HOME=/usr/lib/jvm/java-21-openjdk
ENV PATH="/root/.local/bin:$JAVA_HOME/bin:$PATH"

WORKDIR /app

# Configure the Aliyun pip mirror and install Poetry
RUN --mount=type=cache,target=/root/.cache/pip \
    pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ && \
    pip config set global.trusted-host mirrors.aliyun.com && \
    pip install --upgrade --root-user-action=ignore pip \
    && pip install --root-user-action=ignore pipx \
    && pipx install "poetry==$POETRY_VERSION"

COPY --from=datax-builder /DataX/target/datax/datax /opt/datax
RUN cp /opt/datax/plugin/reader/mysqlreader/libs/mysql* /opt/datax/plugin/reader/starrocksreader/libs/

# Copy only dependency files first
COPY runtime/datamate-python/pyproject.toml runtime/datamate-python/poetry.lock* /app/

# Install dependencies
RUN --mount=type=cache,target=$POETRY_CACHE_DIR \
    poetry install --no-root --only main

# Offline mode: use local NLTK data
ARG RESOURCES_DIR=./build-cache/resources
ARG NLTK_DATA_LOCAL_PATH=${RESOURCES_DIR}/nltk_data
COPY ${NLTK_DATA_LOCAL_PATH} /usr/local/nltk_data

ENV NLTK_DATA=/usr/local/nltk_data

# Copy the rest of the application
COPY runtime/datamate-python /app

COPY runtime/datamate-python/deploy/docker-entrypoint.sh /docker-entrypoint.sh
RUN chmod +x /docker-entrypoint.sh || true

EXPOSE 18000

ENTRYPOINT ["/docker-entrypoint.sh"]
scripts/offline/Dockerfile.backend.offline (new file, 71 lines)
@@ -0,0 +1,71 @@
# backend Dockerfile, offline variant
# Builds on top of the base image with APT packages pre-installed

FROM maven:3-eclipse-temurin-21 AS builder

# Configure the Aliyun Maven mirror
RUN mkdir -p /root/.m2 && \
    echo '<?xml version="1.0" encoding="UTF-8"?>\n\
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"\n\
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n\
          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">\n\
  <mirrors>\n\
    <mirror>\n\
      <id>aliyunmaven</id>\n\
      <mirrorOf>*</mirrorOf>\n\
      <name>阿里云公共仓库</name>\n\
      <url>https://maven.aliyun.com/repository/public</url>\n\
    </mirror>\n\
  </mirrors>\n\
</settings>' > /root/.m2/settings.xml

WORKDIR /opt/backend

# Copy all pom.xml files first
COPY backend/pom.xml ./
COPY backend/services/pom.xml ./services/
COPY backend/shared/domain-common/pom.xml ./shared/domain-common/
COPY backend/shared/security-common/pom.xml ./shared/security-common/
COPY backend/services/data-annotation-service/pom.xml ./services/data-annotation-service/
COPY backend/services/data-cleaning-service/pom.xml ./services/data-cleaning-service/
COPY backend/services/data-evaluation-service/pom.xml ./services/data-evaluation-service/
COPY backend/services/data-management-service/pom.xml ./services/data-management-service/
COPY backend/services/data-synthesis-service/pom.xml ./services/data-synthesis-service/
COPY backend/services/execution-engine-service/pom.xml ./services/execution-engine-service/
COPY backend/services/main-application/pom.xml ./services/main-application/
COPY backend/services/operator-market-service/pom.xml ./services/operator-market-service/
COPY backend/services/pipeline-orchestration-service/pom.xml ./services/pipeline-orchestration-service/
COPY backend/services/rag-indexer-service/pom.xml ./services/rag-indexer-service/
COPY backend/services/rag-query-service/pom.xml ./services/rag-query-service/

# Download dependencies into the cache volume
RUN --mount=type=cache,target=/root/.m2/repository \
    cd /opt/backend/services && \
    mvn dependency:go-offline -Dmaven.test.skip=true || true

# Copy all sources
COPY backend/ /opt/backend

# Compile and package
RUN --mount=type=cache,target=/root/.m2/repository \
    cd /opt/backend/services && \
    mvn clean package -Dmaven.test.skip=true

# Use the base image with APT packages pre-installed
FROM datamate-java-base:latest

# No apt-get update here: the base image already ships every required package.
# Extra packages could be added at this point, but the install would fail offline.

COPY --from=builder /opt/backend/services/main-application/target/datamate.jar /opt/backend/datamate.jar
COPY scripts/images/backend/start.sh /opt/backend/start.sh
COPY runtime/ops/examples/test_operator/test_operator.tar /opt/backend/test_operator.tar

RUN dos2unix /opt/backend/start.sh \
    && chmod +x /opt/backend/start.sh \
    && ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

EXPOSE 8080

ENTRYPOINT ["/opt/backend/start.sh"]
CMD ["java", "-Duser.timezone=Asia/Shanghai", "-jar", "/opt/backend/datamate.jar"]
scripts/offline/Dockerfile.base-images (new file, 62 lines)
@@ -0,0 +1,62 @@
# Base images with APT packages pre-installed
# Build these images in the online environment; use them as base images offline

# ==================== backend / gateway base image ====================
FROM eclipse-temurin:21-jdk AS datamate-java-base

# Switch apt to the Aliyun mirror
RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
    elif [ -f /etc/apt/sources.list.d/ubuntu.sources ]; then \
        sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g; s/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list.d/ubuntu.sources; \
    elif [ -f /etc/apt/sources.list ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g; s/archive.ubuntu.com/mirrors.aliyun.com/g; s/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list; \
    fi && \
    apt-get update && \
    apt-get install -y vim wget curl rsync python3 python3-pip python-is-python3 dos2unix libreoffice fonts-noto-cjk && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# ==================== backend-python base image ====================
FROM python:3.12-slim AS datamate-python-base

RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
    elif [ -f /etc/apt/sources.list ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
    fi && \
    apt-get update && \
    apt-get install -y --no-install-recommends vim openjdk-21-jre nfs-common glusterfs-client rsync && \
    rm -rf /var/lib/apt/lists/*

# ==================== runtime base image ====================
FROM ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm AS datamate-runtime-base

RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
    elif [ -f /etc/apt/sources.list ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
    fi && \
    apt update && \
    apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix swig poppler-utils tesseract-ocr && \
    rm -rf /var/lib/apt/lists/*

# ==================== deer-flow-backend base image ====================
FROM ghcr.nju.edu.cn/astral-sh/uv:python3.12-bookworm AS deer-flow-backend-base

RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
    elif [ -f /etc/apt/sources.list ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
    fi && \
    apt-get update && apt-get install -y libpq-dev git && \
    rm -rf /var/lib/apt/lists/*

# ==================== mineru base image ====================
FROM python:3.11-slim AS mineru-base

RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources && \
    apt-get update && \
    apt-get install -y curl vim libgl1 libglx0 libopengl0 libglib2.0-0 procps && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
scripts/offline/Dockerfile.gateway.offline (new file, 47 lines)
@@ -0,0 +1,47 @@
# gateway Dockerfile, offline variant
FROM maven:3-eclipse-temurin-21 AS builder

# Configure the Aliyun Maven mirror
RUN mkdir -p /root/.m2 && \
    echo '<?xml version="1.0" encoding="UTF-8"?>\n\
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"\n\
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n\
          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">\n\
  <mirrors>\n\
    <mirror>\n\
      <id>aliyunmaven</id>\n\
      <mirrorOf>*</mirrorOf>\n\
      <name>阿里云公共仓库</name>\n\
      <url>https://maven.aliyun.com/repository/public</url>\n\
    </mirror>\n\
  </mirrors>\n\
</settings>' > /root/.m2/settings.xml

WORKDIR /opt/gateway

COPY backend/pom.xml ./
COPY backend/api-gateway/pom.xml ./api-gateway/

RUN --mount=type=cache,target=/root/.m2/repository \
    cd /opt/gateway/api-gateway && \
    mvn dependency:go-offline -Dmaven.test.skip=true || true

COPY backend/api-gateway /opt/gateway/api-gateway

RUN --mount=type=cache,target=/root/.m2/repository \
    cd /opt/gateway/api-gateway && \
    mvn clean package -Dmaven.test.skip=true

FROM datamate-java-base:latest

COPY --from=builder /opt/gateway/api-gateway/target/gateway.jar /opt/gateway/gateway.jar
COPY scripts/images/gateway/start.sh /opt/gateway/start.sh

RUN dos2unix /opt/gateway/start.sh \
    && chmod +x /opt/gateway/start.sh \
    && ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

EXPOSE 8080

ENTRYPOINT ["/opt/gateway/start.sh"]
CMD ["java", "-Duser.timezone=Asia/Shanghai", "-jar", "/opt/gateway/gateway.jar"]
scripts/offline/Dockerfile.runtime.offline-v2 (new file, 42 lines)
@@ -0,0 +1,42 @@
# runtime Dockerfile, offline variant v2
# Builds on top of the base image with APT packages pre-installed

FROM datamate-runtime-base:latest

# Offline mode: local model file paths
ARG RESOURCES_DIR=./build-cache/resources
ARG MODELS_DIR=${RESOURCES_DIR}/models

# Copy the local PaddleOCR model
RUN mkdir -p /home/models
COPY ${MODELS_DIR}/ch_ppocr_mobile_v2.0_cls_infer.tar /home/models/
RUN tar -xf /home/models/ch_ppocr_mobile_v2.0_cls_infer.tar -C /home/models

COPY runtime/python-executor /opt/runtime
COPY runtime/ops /opt/runtime/datamate/ops
COPY runtime/ops/user /opt/runtime/user
COPY scripts/images/runtime/start.sh /opt/runtime/start.sh

ENV PYTHONPATH=/opt/runtime/datamate/
ENV UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV UV_INDEX_STRATEGY=unsafe-best-match
ENV UV_INDEX_URL="https://mirrors.aliyun.com/pypi/simple/"

WORKDIR /opt/runtime

# Copy the local spaCy model
COPY ${MODELS_DIR}/zh_core_web_sm-3.8.0-py3-none-any.whl /tmp/

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -e .[all] --system \
    && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
    && uv pip install /tmp/zh_core_web_sm-3.8.0-py3-none-any.whl --system \
    && echo "/usr/local/lib/ops/site-packages" > /usr/local/lib/python3.11/site-packages/ops.pth

RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && chmod +x /opt/runtime/start.sh \
    && dos2unix /opt/runtime/start.sh

EXPOSE 8081

ENTRYPOINT ["/opt/runtime/start.sh"]
@@ -74,20 +74,29 @@ scp build-cache-20250202.tar.gz user@offline-server:/opt/datamate/
# Or use a USB drive or other physical media
```

#### 4. Building in the offline environment (the classic method is recommended)

```bash
# Unpack the cache
tar -xzf build-cache-20250202.tar.gz

# Diagnose the environment (checks base images, etc.)
make offline-diagnose

# Method A: classic docker build (recommended, more stable)
make offline-setup
make offline-build-classic

# Method B: BuildKit build (if Method A fails)
make offline-setup
make offline-build

# Optionally pin a version tag
make offline-build-classic OFFLINE_VERSION=v1.0.0
```

**⚠️ Important**: if you run into image-pull problems, use `make offline-build-classic` rather than `make offline-build`.

### Method 2: using the standalone scripts

#### Exporting the cache
@@ -162,9 +171,133 @@ tar -czf build-cache-partial.tar.gz build-cache/buildkit/backend-cache
make backend-offline-build
```

## The APT cache problem in detail

### Symptom

Even with `--mount=type=cache,target=/var/cache/apt`, the `apt-get update` step in a Dockerfile still tries to fetch the package lists from the network, so the build fails in an offline environment:

```
Err:1 http://mirrors.aliyun.com/debian bookworm InRelease
  Could not resolve 'mirrors.aliyun.com'
Reading package lists...
E: Failed to fetch http://mirrors.aliyun.com/debian/dists/bookworm/InRelease
```

### Root cause

- `--mount=type=cache,target=/var/cache/apt` only caches the downloaded `.deb` packages
- `apt-get update` always tries to fetch fresh package indexes (InRelease/Packages files) from the configured sources
- the package indexes live in `/var/lib/apt/lists/`, which is normally not covered by the cache mount (a sketch of caching both paths follows below)
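
Purely to illustrate the mechanism (a minimal sketch assuming BuildKit's `dockerfile:1` syntax, not one of the documented solutions below), a RUN step that keeps both directories in cache mounts would look roughly like this; the package names are placeholders:

```dockerfile
# syntax=docker/dockerfile:1
FROM eclipse-temurin:21-jdk

# Keep both the .deb archives and the package indexes in BuildKit cache mounts.
# The first (online) build populates them; later offline rebuilds can then install
# from the caches. On Debian/Ubuntu images the docker-clean apt config would delete
# the archives after install, so it is removed here to let the .deb cache survive.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
    rm -f /etc/apt/apt.conf.d/docker-clean && \
    (apt-get update || true) && \
    apt-get install -y --no-install-recommends dos2unix rsync
```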

### Solutions

#### Solution 1: base images with APT packages pre-installed (recommended)

This is the most effective approach.

**Step 1**: in the online environment, build base images with all dependencies pre-installed

```bash
# Build and save the base images with pre-installed APT packages
./scripts/offline/build-base-images.sh
```

This creates the following pre-installed base images:
- `datamate-java-base` - for backend and gateway (pre-installs vim, python3, libreoffice, ...)
- `datamate-python-base` - for backend-python (pre-installs openjdk, nfs-common, ...)
- `datamate-runtime-base` - for runtime (pre-installs libgl1, tesseract-ocr, ...)
- `deer-flow-backend-base` - for deer-flow-backend
- `mineru-base` - for mineru

**Step 2**: in the offline environment, build on top of these base images

```bash
# Load the tar archive that contains the pre-installed base images
docker load -i build-cache/images/base-images-with-apt.tar

# Run the final build script
./scripts/offline/build-offline-final.sh
```

#### Solution 2: edit the Dockerfile to skip apt update

If no new packages need to be installed, the Dockerfile can be changed:

```dockerfile
# Original
RUN apt-get update && apt-get install -y xxx

# Changed for the offline environment
# RUN apt-get update && \
RUN apt-get install -y xxx || true
```

#### Solution 3: mount a saved apt lists cache

Download and save the apt lists in the online environment first:

```bash
# Online environment: save the apt lists
docker run --rm \
  -v "$(pwd)/apt-lists:/var/lib/apt/lists" \
  eclipse-temurin:21-jdk \
  apt-get update

# Offline environment: mount the saved lists
docker build \
  --mount=type=bind,source=$(pwd)/apt-lists,target=/var/lib/apt/lists,ro \
  -f Dockerfile .
```

**Note**: `docker build` does not support `--mount=type=bind` directly on the command line; the bind mount has to be declared inside the Dockerfile (see the sketch below).
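
A rough sketch of that in-Dockerfile form, assuming the saved `apt-lists/` directory has been copied into the build context; the directory name and the packages are illustrative only:

```dockerfile
# syntax=docker/dockerfile:1
FROM eclipse-temurin:21-jdk

# Bind-mount the pre-downloaded package indexes from the build context for this
# single RUN step (bind mounts are read-only by default), so apt can resolve
# packages without running apt-get update. The .deb files themselves must still
# come from somewhere local (a cache mount or a local mirror), otherwise the
# install step will hit the network.
RUN --mount=type=bind,source=apt-lists,target=/var/lib/apt/lists \
    apt-get install -y --no-install-recommends dos2unix rsync
```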

---

## Troubleshooting

### Problem 1: the build still tries to pull images (most common)

**Symptom**:
```
ERROR: failed to solve: pulling from host ...
or
ERROR: pull access denied, repository does not exist or may require authorization
```

**Cause**:
- the base images were not loaded correctly
- BuildKit tries to validate the remote image

**Fix**:

1. **Use the classic build (recommended)**:
   ```bash
   make offline-build-classic
   ```

2. **Load the base images manually**:
   ```bash
   # Load the base images
   docker load -i build-cache/images/base-images.tar

   # Verify that the images exist
   docker images | grep -E "(maven|eclipse-temurin|mysql|node|nginx)"
   ```

3. **Put the Docker daemon into offline mode**:
   ```bash
   # Edit /etc/docker/daemon.json
   {
     "registry-mirrors": [],
     "insecure-registries": []
   }

   # Restart Docker
   sudo systemctl restart docker
   ```

### Problem 2: cache import fails

```
ERROR: failed to solve: failed to read cache metadata
```

@@ -172,23 +305,26 @@ ERROR: failed to solve: failed to read cache metadata

**Fix**: the cache directory is probably corrupted; export it again in the online environment.

### Problem 3: base image missing

```
ERROR: pull access denied
```

**Fix**:
1. Run `make offline-setup` first to load the base images
2. Run `make offline-diagnose` to check which images are missing
3. When re-exporting the cache, make sure all base images are included

### Problem 4: network errors (offline environment)

```
ERROR: failed to do request: dial tcp: lookup ...
```

**Fix**: check whether the Dockerfile still has network dependencies (such as `git clone`, `wget`, `pip install`); it may need to be changed to use local resources (a sketch follows below).
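
For example, a network fetch can usually be swapped for a COPY from the exported resources directory; the URL and file names below are hypothetical and only follow the `RESOURCES_DIR` convention used by the offline Dockerfiles in this directory:

```dockerfile
# Before (needs the network):
# RUN wget -O /tmp/model.tar https://example.com/model.tar

# After (offline): ship the archive in build-cache/resources and copy it in
ARG RESOURCES_DIR=./build-cache/resources
COPY ${RESOURCES_DIR}/model.tar /tmp/model.tar
RUN mkdir -p /opt/models && tar -xf /tmp/model.tar -C /opt/models
```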

### Problem 5: out of memory

The BuildKit cache can consume a lot of memory; resource limits can be set for the builder:

@@ -200,6 +336,39 @@ docker buildx create --name offline-builder \
  --use
```

### Problem 6: the BuildKit builder cannot use local images

**Symptom**: the images are loaded, but BuildKit still reports that they cannot be found.

**Fix**: the BuildKit `docker-container` driver has no direct access to the local image store. Use one of the following:

**Method A**: classic Docker build (recommended)
```bash
make offline-build-classic
```

**Method B**: push the images to a local registry
```bash
# Start a local registry
docker run -d -p 5000:5000 --name registry registry:2

# Tag and push the images to the local registry
docker tag maven:3-eclipse-temurin-21 localhost:5000/maven:3-eclipse-temurin-21
docker push localhost:5000/maven:3-eclipse-temurin-21

# Point the Dockerfile at the local registry
# FROM localhost:5000/maven:3-eclipse-temurin-21
```

**Method C**: use a buildx builder with the `docker` driver (no image push needed, but with other limitations)
```bash
# New builders cannot be created with the docker driver;
# the default builder already uses it and can see locally loaded images
docker buildx use default

# This mode cannot use --cache-from type=local,
# so it only suits simple offline builds
```

## Limitations

1. **Image versions**: the base image versions must match the ones used when the cache was exported
@@ -239,6 +408,81 @@ docker buildx build \
  -f scripts/images/backend/Dockerfile .
```

## File inventory

```
scripts/offline/
├── export-cache.sh                        # Export the build cache (online environment)
├── build-base-images.sh                   # Build the base images with pre-installed APT packages
├── build-offline.sh                       # Basic offline build script (BuildKit)
├── build-offline-v2.sh                    # Enhanced offline build script
├── build-offline-classic.sh               # Classic docker build script
├── build-offline-final.sh                 # Final version (uses the pre-installed base images, recommended)
├── diagnose.sh                            # Environment diagnostics
├── Dockerfile.base-images                 # Definition of the base images with pre-installed APT packages
├── Dockerfile.backend.offline             # Offline Dockerfile for backend (uses the pre-installed base image)
├── Dockerfile.gateway.offline             # Offline Dockerfile for gateway (uses the pre-installed base image)
├── Dockerfile.backend-python.offline      # Offline Dockerfile for backend-python
├── Dockerfile.backend-python.offline-v2   # Offline Dockerfile for backend-python v2 (uses the pre-installed base image)
├── Dockerfile.runtime.offline             # Offline Dockerfile for runtime
├── Dockerfile.runtime.offline-v2          # Offline Dockerfile for runtime v2 (uses the pre-installed base image)
├── Dockerfile.deer-flow-backend.offline   # Offline Dockerfile for deer-flow-backend
├── Dockerfile.deer-flow-frontend.offline  # Offline Dockerfile for deer-flow-frontend
├── Makefile.offline                       # Standalone offline-build Makefile
└── README.md                              # This document

Makefile.offline.mk                        # Makefile extension (appended to the main Makefile)
```

## Recommended workflows (APT-problem edition)

### Workflow A: base images with pre-installed APT packages (solves the APT problem for good)

```bash
# ========== Online environment ==========

# 1. Build and save the base images with pre-installed APT packages
./scripts/offline/build-base-images.sh
# Output: build-cache/images/base-images-with-apt.tar

# 2. Export the remaining caches (BuildKit cache, external resources)
./scripts/offline/export-cache.sh

# 3. Package and transfer
scp build-cache/images/base-images-with-apt.tar user@offline-server:/opt/datamate/build-cache/images/
scp build-cache-*.tar.gz user@offline-server:/opt/datamate/

# ========== Offline environment ==========

cd /opt/datamate

# 4. Unpack
tar -xzf build-cache-*.tar.gz

# 5. Load the pre-installed base images (essential!)
docker load -i build-cache/images/base-images-with-apt.tar

# 6. Build with the final script
./scripts/offline/build-offline-final.sh
```

### Workflow B: simple scenario (classic build)

If the APT package requirements are simple, the classic build can be used directly:

```bash
# Online environment
make offline-export

# Transfer to the offline environment
scp build-cache-*.tar.gz offline-server:/path/

# Offline environment
tar -xzf build-cache-*.tar.gz
make offline-diagnose        # check the environment
make offline-build-classic   # classic build
```

## References

- [Docker BuildKit Documentation](https://docs.docker.com/build/buildkit/)
scripts/offline/build-base-images.sh (new file, 87 lines)
@@ -0,0 +1,87 @@
#!/bin/bash
# Build the base images with pre-installed APT packages
# Usage: ./build-base-images.sh [output-dir]

set -e

OUTPUT_DIR="${1:-./build-cache}"
IMAGES_DIR="$OUTPUT_DIR/images"

mkdir -p "$IMAGES_DIR"

echo "======================================"
echo "构建预装 APT 包的基础镜像"
echo "======================================"

# Build each base image
echo ""
echo "1. 构建 datamate-java-base (用于 backend, gateway)..."
docker build \
    -t datamate-java-base:latest \
    --target datamate-java-base \
    -f scripts/offline/Dockerfile.base-images \
    . || echo "Warning: datamate-java-base 构建失败"

echo ""
echo "2. 构建 datamate-python-base (用于 backend-python)..."
docker build \
    -t datamate-python-base:latest \
    --target datamate-python-base \
    -f scripts/offline/Dockerfile.base-images \
    . || echo "Warning: datamate-python-base 构建失败"

echo ""
echo "3. 构建 datamate-runtime-base (用于 runtime)..."
docker build \
    -t datamate-runtime-base:latest \
    --target datamate-runtime-base \
    -f scripts/offline/Dockerfile.base-images \
    . || echo "Warning: datamate-runtime-base 构建失败"

echo ""
echo "4. 构建 deer-flow-backend-base (用于 deer-flow-backend)..."
docker build \
    -t deer-flow-backend-base:latest \
    --target deer-flow-backend-base \
    -f scripts/offline/Dockerfile.base-images \
    . || echo "Warning: deer-flow-backend-base 构建失败"

echo ""
echo "5. 构建 mineru-base (用于 mineru)..."
docker build \
    -t mineru-base:latest \
    --target mineru-base \
    -f scripts/offline/Dockerfile.base-images \
    . || echo "Warning: mineru-base 构建失败"

echo ""
echo "======================================"
echo "保存基础镜像集合"
echo "======================================"

docker save -o "$IMAGES_DIR/base-images-with-apt.tar" \
    maven:3-eclipse-temurin-21 \
    maven:3-eclipse-temurin-8 \
    eclipse-temurin:21-jdk \
    mysql:8 \
    node:20-alpine \
    nginx:1.29 \
    ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm \
    ghcr.nju.edu.cn/astral-sh/uv:python3.12-bookworm \
    ghcr.nju.edu.cn/astral-sh/uv:latest \
    python:3.12-slim \
    python:3.11-slim \
    gcr.io/distroless/nodejs20-debian12 \
    datamate-java-base:latest \
    datamate-python-base:latest \
    datamate-runtime-base:latest \
    deer-flow-backend-base:latest \
    mineru-base:latest \
    2>/dev/null || echo "Warning: 部分镜像保存失败"

echo ""
echo "======================================"
echo "✓ 基础镜像构建完成"
echo "======================================"
echo "镜像列表:"
docker images | grep -E "(datamate-|deer-flow-|mineru-)base" || true
scripts/offline/build-offline-final.sh (new file, 181 lines)
@@ -0,0 +1,181 @@
#!/bin/bash
# Final offline build script: uses the base images with pre-installed APT packages
# Usage: ./build-offline-final.sh [cache-dir] [version]

set -e

CACHE_DIR="${1:-./build-cache}"
VERSION="${2:-latest}"
IMAGES_DIR="$CACHE_DIR/images"
RESOURCES_DIR="$CACHE_DIR/resources"

RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }

# Check the cache directory
if [ ! -d "$CACHE_DIR" ]; then
    log_error "缓存目录 $CACHE_DIR 不存在"
    exit 1
fi

# Load the base images
load_images() {
    log_info "加载基础镜像..."

    # Prefer the image bundle that includes the pre-installed APT packages
    if [ -f "$IMAGES_DIR/base-images-with-apt.tar" ]; then
        log_info "加载带 APT 预装包的基础镜像..."
        docker load -i "$IMAGES_DIR/base-images-with-apt.tar"
    elif [ -f "$IMAGES_DIR/base-images.tar" ]; then
        log_warn "加载普通基础镜像(不含 APT 预装包)..."
        docker load -i "$IMAGES_DIR/base-images.tar"
    else
        log_warn "基础镜像 tar 包不存在,检查本地镜像..."
    fi

    log_info "✓ 镜像加载完成"
}

# Check whether an image exists locally
verify_image() {
    docker inspect "$1" > /dev/null 2>&1
}

# Build one service
build_service() {
    local service_name=$1
    local image_name=$2
    local dockerfile=$3
    local base_image=$4  # required base image

    log_info "----------------------------------------"
    log_info "构建 $service_name"
    log_info "----------------------------------------"

    if [ ! -f "$dockerfile" ]; then
        log_error "Dockerfile 不存在: $dockerfile"
        return 1
    fi

    # Check the required base image
    if [ -n "$base_image" ]; then
        if verify_image "$base_image"; then
            log_info "✓ 基础镜像存在: $base_image"
        else
            log_error "✗ 缺少基础镜像: $base_image"
            log_info "请确保已加载正确的 base-images-with-apt.tar"
            return 1
        fi
    fi

    # Prepare build arguments
    local build_args=()

    # Pass the resources directory when it exists
    if [ -d "$RESOURCES_DIR" ]; then
        build_args+=("--build-arg" "RESOURCES_DIR=$RESOURCES_DIR")
    fi

    # Run the build
    log_info "开始构建..."
    if docker build \
        --pull=false \
        "${build_args[@]}" \
        -f "$dockerfile" \
        -t "$image_name:$VERSION" \
        . 2>&1; then
        log_info "✓ $service_name 构建成功"
        return 0
    else
        log_error "✗ $service_name 构建失败"
        return 1
    fi
}

# Main flow
main() {
    log_info "======================================"
    log_info "最终版离线构建 (使用 APT 预装基础镜像)"
    log_info "======================================"

    # Load the base images
    load_images

    # Verify the key pre-installed base images
    log_info ""
    log_info "验证预装基础镜像..."
    REQUIRED_BASE_IMAGES=(
        "datamate-java-base:latest"
        "datamate-python-base:latest"
        "datamate-runtime-base:latest"
    )

    for img in "${REQUIRED_BASE_IMAGES[@]}"; do
        if verify_image "$img"; then
            log_info "  ✓ $img"
        else
            log_warn "  ✗ $img (缺失)"
        fi
    done

    # Service definitions: image name, Dockerfile, required base image
    declare -A SERVICES=(
        ["database"]="datamate-database:scripts/images/database/Dockerfile:"
        ["gateway"]="datamate-gateway:scripts/offline/Dockerfile.gateway.offline:datamate-java-base:latest"
        ["backend"]="datamate-backend:scripts/offline/Dockerfile.backend.offline:datamate-java-base:latest"
        ["frontend"]="datamate-frontend:scripts/images/frontend/Dockerfile:"
        ["runtime"]="datamate-runtime:scripts/offline/Dockerfile.runtime.offline-v2:datamate-runtime-base:latest"
        ["backend-python"]="datamate-backend-python:scripts/offline/Dockerfile.backend-python.offline-v2:datamate-python-base:latest"
    )

    log_info ""
    log_info "======================================"
    log_info "开始构建服务"
    log_info "======================================"

    local failed=()
    local succeeded=()

    for service_name in "${!SERVICES[@]}"; do
        # The last read variable keeps the remainder of the string, so base images with a tag (name:latest) survive the colon split
        IFS=':' read -r image_name dockerfile base_image <<< "${SERVICES[$service_name]}"
        if build_service "$service_name" "$image_name" "$dockerfile" "$base_image"; then
            succeeded+=("$service_name")
        else
            failed+=("$service_name")
        fi
        echo ""
    done

    # Summary
    log_info "======================================"
    log_info "构建结果"
    log_info "======================================"

    if [ ${#succeeded[@]} -gt 0 ]; then
        log_info "成功 (${#succeeded[@]}): ${succeeded[*]}"
    fi

    if [ ${#failed[@]} -gt 0 ]; then
        log_error "失败 (${#failed[@]}): ${failed[*]}"

        log_info ""
        log_info "提示: 如果失败是因为缺少预装基础镜像,请确保:"
        log_info "  1. 在有网环境执行: ./scripts/offline/build-base-images.sh"
        log_info "  2. 将生成的 base-images-with-apt.tar 传输到无网环境"
        log_info "  3. 在无网环境加载: docker load -i base-images-with-apt.tar"

        exit 1
    else
        log_info "✓ 所有服务构建成功!"
        echo ""
        docker images --format "table {{.Repository}}:{{.Tag}}\t{{.Size}}" | grep -E "(datamate-|deer-flow-)" || true
    fi
}

main "$@"
@@ -8,6 +8,7 @@ OUTPUT_DIR="${1:-./build-cache}"
BUILDKIT_CACHE_DIR="$OUTPUT_DIR/buildkit"
IMAGES_DIR="$OUTPUT_DIR/images"
RESOURCES_DIR="$OUTPUT_DIR/resources"
APT_CACHE_DIR="$OUTPUT_DIR/apt-cache"

# Make sure the buildx builder exists
if ! docker buildx inspect offline-builder > /dev/null 2>&1; then
@@ -17,7 +18,7 @@ else
    docker buildx use offline-builder
fi

mkdir -p "$BUILDKIT_CACHE_DIR" "$IMAGES_DIR" "$RESOURCES_DIR" "$APT_CACHE_DIR"

echo "======================================"
echo "1. 导出基础镜像"
@@ -117,11 +118,42 @@ fi

echo ""
echo "======================================"
echo "4. 导出 APT 缓存"
echo "======================================"

# Pre-generate the apt cache for the images that need apt
echo "生成 APT list 缓存..."

# apt cache for eclipse-temurin:21-jdk
docker run --rm \
    -v "$APT_CACHE_DIR/eclipse-temurin:/var/cache/apt/archives" \
    -v "$APT_CACHE_DIR/eclipse-temurin-lists:/var/lib/apt/lists" \
    eclipse-temurin:21-jdk \
    bash -c "apt-get update && apt-get install -y --download-only vim wget curl rsync python3 python3-pip python-is-python3 dos2unix libreoffice fonts-noto-cjk 2>/dev/null || true" 2>/dev/null || echo "  Warning: eclipse-temurin apt 缓存导出失败"

# apt cache for python:3.12-slim
docker run --rm \
    -v "$APT_CACHE_DIR/python312:/var/cache/apt/archives" \
    -v "$APT_CACHE_DIR/python312-lists:/var/lib/apt/lists" \
    python:3.12-slim \
    bash -c "apt-get update && apt-get install -y --download-only vim openjdk-21-jre nfs-common glusterfs-client rsync 2>/dev/null || true" 2>/dev/null || echo "  Warning: python3.12 apt 缓存导出失败"

# apt cache for python:3.11-slim
docker run --rm \
    -v "$APT_CACHE_DIR/python311:/var/cache/apt/archives" \
    -v "$APT_CACHE_DIR/python311-lists:/var/lib/apt/lists" \
    python:3.11-slim \
    bash -c "apt-get update && apt-get install -y --download-only curl vim libgl1 libglx0 libopengl0 libglib2.0-0 procps 2>/dev/null || true" 2>/dev/null || echo "  Warning: python3.11 apt 缓存导出失败"

echo "✓ APT 缓存导出完成"

echo ""
echo "======================================"
echo "5. 打包缓存"
echo "======================================"

cd "$OUTPUT_DIR"
tar -czf "build-cache-$(date +%Y%m%d).tar.gz" buildkit images resources apt-cache
cd - > /dev/null

echo ""
@@ -131,4 +163,10 @@ echo "======================================"
echo "缓存位置: $OUTPUT_DIR"
echo "传输文件: $OUTPUT_DIR/build-cache-$(date +%Y%m%d).tar.gz"
echo ""
echo "包含内容:"
echo "  - 基础镜像 (images/)"
echo "  - BuildKit 缓存 (buildkit/)"
echo "  - 外部资源 (resources/)"
echo "  - APT 缓存 (apt-cache/)"
echo ""
echo "请将此压缩包传输到无网环境后解压使用"