Compare commits
52 Commits
Makefile.offline.mk (new file, 304 lines)
@@ -0,0 +1,304 @@
# ============================================================================
# Makefile extension for offline builds
# Append this file to the end of the main Makefile, or include it separately
# ============================================================================

# Offline build configuration
CACHE_DIR ?= ./build-cache
OFFLINE_VERSION ?= latest

# Create the buildx builder (if it does not exist yet)
.PHONY: ensure-buildx
ensure-buildx:
	@if ! docker buildx inspect offline-builder > /dev/null 2>&1; then \
		echo "Creating buildx builder..."; \
		docker buildx create --name offline-builder --driver docker-container --use 2>/dev/null || docker buildx use offline-builder; \
	else \
		docker buildx use offline-builder 2>/dev/null || true; \
	fi

# ========== Offline cache export (networked environment) ==========

.PHONY: offline-export
offline-export: ensure-buildx
	@echo "======================================"
	@echo "Exporting offline build cache..."
	@echo "======================================"
	@mkdir -p $(CACHE_DIR)/buildkit $(CACHE_DIR)/images $(CACHE_DIR)/resources
	@$(MAKE) _offline-export-base-images
	@$(MAKE) _offline-export-cache
	@$(MAKE) _offline-export-resources
	@$(MAKE) _offline-package

.PHONY: _offline-export-base-images
_offline-export-base-images:
	@echo ""
	@echo "1. Exporting base images..."
	@bash -c 'images=( \
		"maven:3-eclipse-temurin-21" \
		"maven:3-eclipse-temurin-8" \
		"eclipse-temurin:21-jdk" \
		"mysql:8" \
		"node:20-alpine" \
		"nginx:1.29" \
		"ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm" \
		"ghcr.nju.edu.cn/astral-sh/uv:python3.12-bookworm" \
		"ghcr.nju.edu.cn/astral-sh/uv:latest" \
		"python:3.12-slim" \
		"python:3.11-slim" \
		"gcr.nju.edu.cn/distroless/nodejs20-debian12" \
	); for img in "$${images[@]}"; do echo "  Pulling $$img..."; docker pull "$$img" 2>/dev/null || true; done'
	@echo "  Saving base images..."
	@docker save -o $(CACHE_DIR)/images/base-images.tar \
		maven:3-eclipse-temurin-21 \
		maven:3-eclipse-temurin-8 \
		eclipse-temurin:21-jdk \
		mysql:8 \
		node:20-alpine \
		nginx:1.29 \
		ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm \
		ghcr.nju.edu.cn/astral-sh/uv:python3.12-bookworm \
		ghcr.nju.edu.cn/astral-sh/uv:latest \
		python:3.12-slim \
		python:3.11-slim \
		gcr.nju.edu.cn/distroless/nodejs20-debian12 2>/dev/null || echo "  Warning: Some images may not exist"

.PHONY: _offline-export-cache
_offline-export-cache:
	@echo ""
	@echo "2. Exporting BuildKit cache..."
	@echo "  backend..."
	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/backend-cache,mode=max -f scripts/images/backend/Dockerfile -t datamate-backend:cache . 2>/dev/null || echo "  Warning: backend cache export failed"
	@echo "  backend-python..."
	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/backend-python-cache,mode=max -f scripts/images/backend-python/Dockerfile -t datamate-backend-python:cache . 2>/dev/null || echo "  Warning: backend-python cache export failed"
	@echo "  database..."
	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/database-cache,mode=max -f scripts/images/database/Dockerfile -t datamate-database:cache . 2>/dev/null || echo "  Warning: database cache export failed"
	@echo "  frontend..."
	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/frontend-cache,mode=max -f scripts/images/frontend/Dockerfile -t datamate-frontend:cache . 2>/dev/null || echo "  Warning: frontend cache export failed"
	@echo "  gateway..."
	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/gateway-cache,mode=max -f scripts/images/gateway/Dockerfile -t datamate-gateway:cache . 2>/dev/null || echo "  Warning: gateway cache export failed"
	@echo "  runtime..."
	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/runtime-cache,mode=max -f scripts/images/runtime/Dockerfile -t datamate-runtime:cache . 2>/dev/null || echo "  Warning: runtime cache export failed"
	@echo "  deer-flow-backend..."
	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/deer-flow-backend-cache,mode=max -f scripts/images/deer-flow-backend/Dockerfile -t deer-flow-backend:cache . 2>/dev/null || echo "  Warning: deer-flow-backend cache export failed"
	@echo "  deer-flow-frontend..."
	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/deer-flow-frontend-cache,mode=max -f scripts/images/deer-flow-frontend/Dockerfile -t deer-flow-frontend:cache . 2>/dev/null || echo "  Warning: deer-flow-frontend cache export failed"
	@echo "  mineru..."
	@docker buildx build --cache-to type=local,dest=$(CACHE_DIR)/buildkit/mineru-cache,mode=max -f scripts/images/mineru/Dockerfile -t datamate-mineru:cache . 2>/dev/null || echo "  Warning: mineru cache export failed"

.PHONY: _offline-export-resources
_offline-export-resources:
	@echo ""
	@echo "3. Pre-downloading external resources..."
	@mkdir -p $(CACHE_DIR)/resources/models
	@echo "  PaddleOCR model..."
	@wget -q -O $(CACHE_DIR)/resources/models/ch_ppocr_mobile_v2.0_cls_infer.tar \
		https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar 2>/dev/null || echo "  Warning: PaddleOCR model download failed"
	@echo "  spaCy model..."
	@wget -q -O $(CACHE_DIR)/resources/models/zh_core_web_sm-3.8.0-py3-none-any.whl \
		https://ghproxy.net/https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl 2>/dev/null || echo "  Warning: spaCy model download failed"
	@echo "  DataX source..."
	@if [ ! -d "$(CACHE_DIR)/resources/DataX" ]; then \
		git clone --depth 1 https://gitee.com/alibaba/DataX.git $(CACHE_DIR)/resources/DataX 2>/dev/null || echo "  Warning: DataX clone failed"; \
	fi
	@echo "  deer-flow source..."
	@if [ ! -d "$(CACHE_DIR)/resources/deer-flow" ]; then \
		git clone --depth 1 https://ghproxy.net/https://github.com/ModelEngine-Group/deer-flow.git $(CACHE_DIR)/resources/deer-flow 2>/dev/null || echo "  Warning: deer-flow clone failed"; \
	fi

.PHONY: _offline-package
_offline-package:
	@echo ""
	@echo "4. Packaging cache..."
	@cd $(CACHE_DIR) && tar -czf "build-cache-$$(date +%Y%m%d).tar.gz" buildkit images resources 2>/dev/null && cd - > /dev/null
	@echo ""
	@echo "======================================"
	@echo "✓ Cache export complete!"
	@echo "======================================"
	@echo "Transfer file: $(CACHE_DIR)/build-cache-$$(date +%Y%m%d).tar.gz"

# ========== Offline build (air-gapped environment) ==========

.PHONY: offline-setup
offline-setup:
	@echo "======================================"
	@echo "Setting up the offline build environment..."
	@echo "======================================"
	@if [ ! -d "$(CACHE_DIR)" ]; then \
		echo "Looking for and unpacking the cache archive..."; \
		cache_file=$$(ls -t build-cache-*.tar.gz 2>/dev/null | head -1); \
		if [ -z "$$cache_file" ]; then \
			echo "Error: cache archive not found (build-cache-*.tar.gz)"; \
			exit 1; \
		fi; \
		echo "Unpacking $$cache_file..."; \
		tar -xzf "$$cache_file"; \
	else \
		echo "Cache directory already exists: $(CACHE_DIR)"; \
	fi
	@echo ""
	@echo "Loading base images..."
	@if [ -f "$(CACHE_DIR)/images/base-images.tar" ]; then \
		docker load -i $(CACHE_DIR)/images/base-images.tar; \
	else \
		echo "Warning: base image archive not found; assuming images were loaded manually"; \
	fi
	@$(MAKE) ensure-buildx
	@echo ""
	@echo "✓ Offline environment ready"

.PHONY: offline-build
offline-build: offline-setup
	@echo ""
	@echo "======================================"
	@echo "Starting offline build..."
	@echo "======================================"
	@$(MAKE) _offline-build-services

.PHONY: _offline-build-services
_offline-build-services: ensure-buildx
	@echo ""
	@echo "Building datamate-database..."
	@docker buildx build \
		--cache-from type=local,src=$(CACHE_DIR)/buildkit/database-cache \
		--pull=false \
		-f scripts/images/database/Dockerfile \
		-t datamate-database:$(OFFLINE_VERSION) \
		--load . || echo "  Failed"

	@echo ""
	@echo "Building datamate-gateway..."
	@docker buildx build \
		--cache-from type=local,src=$(CACHE_DIR)/buildkit/gateway-cache \
		--pull=false \
		-f scripts/images/gateway/Dockerfile \
		-t datamate-gateway:$(OFFLINE_VERSION) \
		--load . || echo "  Failed"

	@echo ""
	@echo "Building datamate-backend..."
	@docker buildx build \
		--cache-from type=local,src=$(CACHE_DIR)/buildkit/backend-cache \
		--pull=false \
		-f scripts/images/backend/Dockerfile \
		-t datamate-backend:$(OFFLINE_VERSION) \
		--load . || echo "  Failed"

	@echo ""
	@echo "Building datamate-frontend..."
	@docker buildx build \
		--cache-from type=local,src=$(CACHE_DIR)/buildkit/frontend-cache \
		--pull=false \
		-f scripts/images/frontend/Dockerfile \
		-t datamate-frontend:$(OFFLINE_VERSION) \
		--load . || echo "  Failed"

	@echo ""
	@echo "Building datamate-runtime..."
	@docker buildx build \
		--cache-from type=local,src=$(CACHE_DIR)/buildkit/runtime-cache \
		--pull=false \
		--build-arg RESOURCES_DIR=$(CACHE_DIR)/resources \
		-f scripts/images/runtime/Dockerfile \
		-t datamate-runtime:$(OFFLINE_VERSION) \
		--load . || echo "  Failed"

	@echo ""
	@echo "Building datamate-backend-python..."
	@docker buildx build \
		--cache-from type=local,src=$(CACHE_DIR)/buildkit/backend-python-cache \
		--pull=false \
		--build-arg RESOURCES_DIR=$(CACHE_DIR)/resources \
		-f scripts/images/backend-python/Dockerfile \
		-t datamate-backend-python:$(OFFLINE_VERSION) \
		--load . || echo "  Failed"

	@echo ""
	@echo "======================================"
	@echo "✓ Offline build complete"
	@echo "======================================"

# Offline build for a single service (BuildKit)
.PHONY: %-offline-build
%-offline-build: offline-setup ensure-buildx
	@echo "Offline-building $*..."
	@if [ ! -d "$(CACHE_DIR)/buildkit/$*-cache" ]; then \
		echo "Error: no cache found for $*"; \
		exit 1; \
	fi
	@$(eval IMAGE_NAME := $(if $(filter deer-flow%,$*),$*,datamate-$*))
	@docker buildx build \
		--cache-from type=local,src=$(CACHE_DIR)/buildkit/$*-cache \
		--pull=false \
		$(if $(filter runtime backend-python deer-flow%,$*),--build-arg RESOURCES_DIR=$(CACHE_DIR)/resources,) \
		-f scripts/images/$*/Dockerfile \
		-t $(IMAGE_NAME):$(OFFLINE_VERSION) \
		--load .

# Classic Docker build (no BuildKit, more robust)
.PHONY: offline-build-classic
offline-build-classic: offline-setup
	@echo "Running the offline build with classic docker build..."
	@bash scripts/offline/build-offline-classic.sh $(CACHE_DIR) $(OFFLINE_VERSION)

# Diagnose the offline environment
.PHONY: offline-diagnose
offline-diagnose:
	@bash scripts/offline/diagnose.sh $(CACHE_DIR)

# Build APT pre-installed base images (networked environment)
.PHONY: offline-build-base-images
offline-build-base-images:
	@echo "Building APT pre-installed base images..."
	@bash scripts/offline/build-base-images.sh $(CACHE_DIR)

# Offline build using pre-installed base images (recommended)
.PHONY: offline-build-final
offline-build-final: offline-setup
	@echo "Running the offline build with APT pre-installed base images..."
	@bash scripts/offline/build-offline-final.sh $(CACHE_DIR) $(OFFLINE_VERSION)

# Full offline export (including APT pre-installed base images)
.PHONY: offline-export-full
offline-export-full:
	@echo "======================================"
	@echo "Full offline cache export (incl. APT pre-installed base images)"
	@echo "======================================"
	@$(MAKE) offline-build-base-images
	@$(MAKE) offline-export
	@echo ""
	@echo "Export complete! Include the following files when transferring:"
	@echo "  - build-cache/images/base-images-with-apt.tar"
	@echo "  - build-cache-YYYYMMDD.tar.gz"

# ========== Help ==========

.PHONY: help-offline
help-offline:
	@echo "Offline build commands:"
	@echo ""
	@echo "[Networked environment]"
	@echo "  make offline-export [CACHE_DIR=./build-cache]  - Export the build cache"
	@echo "  make offline-export-full                       - Export the full cache (incl. APT pre-installed base images)"
	@echo "  make offline-build-base-images                 - Build APT pre-installed base images"
	@echo ""
	@echo "[Air-gapped environment]"
	@echo "  make offline-setup [CACHE_DIR=./build-cache]   - Unpack and prepare the offline cache"
	@echo "  make offline-build-final                       - Build with pre-installed base images (recommended; avoids APT issues)"
	@echo "  make offline-build-classic                     - Build with classic docker build"
	@echo "  make offline-build                             - Build with BuildKit"
	@echo "  make offline-diagnose                          - Diagnose the offline build environment"
	@echo "  make <service>-offline-build                   - Offline-build a single service"
	@echo ""
	@echo "[Full workflow (recommended)]"
	@echo "  # 1. Export the full cache in a networked environment"
	@echo "  make offline-export-full"
	@echo ""
	@echo "  # 2. Transfer to the air-gapped environment (two files are required)"
	@echo "  scp build-cache/images/base-images-with-apt.tar user@offline-server:/path/"
	@echo "  scp build-cache-*.tar.gz user@offline-server:/path/"
	@echo ""
	@echo "  # 3. Build in the air-gapped environment"
	@echo "  tar -xzf build-cache-*.tar.gz"
	@echo "  docker load -i build-cache/images/base-images-with-apt.tar"
	@echo "  make offline-build-final"
@@ -470,6 +470,23 @@ paths:
        '200':
          description: Upload successful

+  /data-management/datasets/upload/cancel-upload/{reqId}:
+    put:
+      tags: [ DatasetFile ]
+      operationId: cancelUpload
+      summary: Cancel upload
+      description: Cancel the pre-upload request and clean up temporary chunks
+      parameters:
+        - name: reqId
+          in: path
+          required: true
+          schema:
+            type: string
+          description: Pre-upload request ID
+      responses:
+        '200':
+          description: Cancelled successfully
+
  /data-management/dataset-types:
    get:
      operationId: getDatasetTypes
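The cancel-upload operation added above is a bare PUT keyed by the pre-upload request id. A minimal client sketch in Java, assuming the service is reachable at http://localhost:8080 and that the path is mounted exactly as written in the spec (both are assumptions, not confirmed by this change):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class CancelUploadExample {
    public static void main(String[] args) throws Exception {
        // Placeholder id: the real value comes back from the pre-upload call
        String reqId = "hypothetical-pre-upload-request-id";
        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder()
                // Base URL is an assumption; substitute the real gateway address and API prefix
                .uri(URI.create("http://localhost:8080/data-management/datasets/upload/cancel-upload/" + reqId))
                .method("PUT", HttpRequest.BodyPublishers.noBody())
                .build();
        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        // 200 indicates the staged chunks for this request were discarded
        System.out.println(response.statusCode());
    }
}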
@@ -1,5 +1,6 @@
package com.datamate.datamanagement.application;

+import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.datamate.common.domain.utils.ChunksSaver;
@@ -101,6 +102,7 @@ public class DatasetApplicationService {
    public Dataset updateDataset(String datasetId, UpdateDatasetRequest updateDatasetRequest) {
        Dataset dataset = datasetRepository.getById(datasetId);
        BusinessAssert.notNull(dataset, DataManagementErrorCode.DATASET_NOT_FOUND);
+
        if (StringUtils.hasText(updateDatasetRequest.getName())) {
            dataset.setName(updateDatasetRequest.getName());
        }
@@ -113,13 +115,31 @@ public class DatasetApplicationService {
        if (Objects.nonNull(updateDatasetRequest.getStatus())) {
            dataset.setStatus(updateDatasetRequest.getStatus());
        }
-        if (updateDatasetRequest.getParentDatasetId() != null) {
+        if (updateDatasetRequest.isParentDatasetIdProvided()) {
+            // Save the original parentDatasetId so we can tell whether it changed
+            String originalParentDatasetId = dataset.getParentDatasetId();
+
+            // Handle the parent-dataset change only when the request explicitly contains parentDatasetId.
+            // handleParentChange normalizes both empty strings and null to null via normalizeParentId,
+            // so this supports setting a new parent dataset as well as clearing the association.
            handleParentChange(dataset, updateDatasetRequest.getParentDatasetId());
+
+            // Check whether parentDatasetId actually changed
+            if (!Objects.equals(originalParentDatasetId, dataset.getParentDatasetId())) {
+                // Use LambdaUpdateWrapper to update the parentDatasetId column explicitly,
+                // so it is written to the database even when the new value is null
+                datasetRepository.update(null, new LambdaUpdateWrapper<Dataset>()
+                        .eq(Dataset::getId, datasetId)
+                        .set(Dataset::getParentDatasetId, dataset.getParentDatasetId()));
+            }
        }
+
        if (StringUtils.hasText(updateDatasetRequest.getDataSource())) {
            // The data source id is not empty: scan the files on disk asynchronously and persist them
            processDataSourceAsync(dataset.getId(), updateDatasetRequest.getDataSource());
        }
+
+        // Update the remaining fields (parentDatasetId excluded: it was already updated above)
        datasetRepository.updateById(dataset);
        return dataset;
    }
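The comments in this hunk lean on a MyBatis-Plus detail: updateById() leaves null fields out of the generated UPDATE under the default not-null field strategy, so writing parentDatasetId back to null through the entity alone would be silently skipped. A minimal sketch of the wrapper-based update used above; the entity and mapper below are illustrative stand-ins for the project's classes, and running it still requires the usual MyBatis-Plus mapper and table wiring:

import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;

public class ParentDatasetIdUpdateSketch {

    // Minimal stand-in for the project's Dataset entity (illustrative only)
    public static class Dataset {
        private String id;
        private String parentDatasetId;
        public String getId() { return id; }
        public String getParentDatasetId() { return parentDatasetId; }
    }

    public interface DatasetMapper extends BaseMapper<Dataset> { }

    // updateById(entity) would skip a null parentDatasetId; an explicit .set() forces the
    // column into the UPDATE statement, which also works when the new value is null
    static void clearParent(DatasetMapper mapper, String datasetId) {
        mapper.update(null, new LambdaUpdateWrapper<Dataset>()
                .eq(Dataset::getId, datasetId)
                .set(Dataset::getParentDatasetId, null));
    }
}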
@@ -144,7 +164,7 @@ public class DatasetApplicationService {
    public Dataset getDataset(String datasetId) {
        Dataset dataset = datasetRepository.getById(datasetId);
        BusinessAssert.notNull(dataset, DataManagementErrorCode.DATASET_NOT_FOUND);
-        List<DatasetFile> datasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
+        List<DatasetFile> datasetFiles = datasetFileRepository.findAllVisibleByDatasetId(datasetId);
        dataset.setFiles(datasetFiles);
        applyVisibleFileCounts(Collections.singletonList(dataset));
        return dataset;
@@ -419,7 +439,7 @@ public class DatasetApplicationService {

        Map<String, Object> statistics = new HashMap<>();

-        List<DatasetFile> allFiles = datasetFileRepository.findAllByDatasetId(datasetId);
+        List<DatasetFile> allFiles = datasetFileRepository.findAllVisibleByDatasetId(datasetId);
        List<DatasetFile> visibleFiles = filterVisibleFiles(allFiles);
        long totalFiles = visibleFiles.size();
        long completedFiles = visibleFiles.stream()
@@ -58,7 +58,6 @@ import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.*;
-import java.util.concurrent.CompletableFuture;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -83,6 +82,11 @@ public class DatasetFileApplicationService {
            XLSX_FILE_TYPE
    );
    private static final String DERIVED_METADATA_KEY = "derived_from_file_id";
+    private static final String FILE_STATUS_ACTIVE = "ACTIVE";
+    private static final String FILE_STATUS_ARCHIVED = "ARCHIVED";
+    private static final String INTERNAL_DIR_NAME = ".datamate";
+    private static final String INTERNAL_UPLOAD_DIR_NAME = "uploading";
+    private static final String INTERNAL_VERSIONS_DIR_NAME = "versions";

    private final DatasetFileRepository datasetFileRepository;
    private final DatasetRepository datasetRepository;
@@ -93,7 +97,7 @@ public class DatasetFileApplicationService {
    @Value("${datamate.data-management.base-path:/dataset}")
    private String datasetBasePath;

-    @Value("${datamate.data-management.file.duplicate:COVER}")
+    @Value("${datamate.data-management.file.duplicate:VERSION}")
    private DuplicateMethod duplicateMethod;

    @Autowired
@@ -162,9 +166,19 @@ public class DatasetFileApplicationService {
        if (dataset == null) {
            return PagedResponse.of(new Page<>(page, size));
        }
-        String datasetPath = dataset.getPath();
-        Path queryPath = Path.of(dataset.getPath() + File.separator + prefix);
-        Map<String, DatasetFile> datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId)
+        Path datasetRoot = Paths.get(dataset.getPath()).toAbsolutePath().normalize();
+        prefix = Optional.ofNullable(prefix).orElse("").trim().replace("\\", "/");
+        while (prefix.startsWith("/")) {
+            prefix = prefix.substring(1);
+        }
+        if (prefix.equals(INTERNAL_DIR_NAME) || prefix.startsWith(INTERNAL_DIR_NAME + "/")) {
+            return new PagedResponse<>(page, size, 0, 0, Collections.emptyList());
+        }
+        Path queryPath = datasetRoot.resolve(prefix.replace("/", File.separator)).normalize();
+        if (!queryPath.startsWith(datasetRoot)) {
+            return new PagedResponse<>(page, size, 0, 0, Collections.emptyList());
+        }
+        Map<String, DatasetFile> datasetFilesMap = datasetFileRepository.findAllVisibleByDatasetId(datasetId)
                .stream()
                .filter(file -> file.getFilePath() != null)
                .collect(Collectors.toMap(
@@ -186,7 +200,8 @@ public class DatasetFileApplicationService {
        }
        try (Stream<Path> pathStream = Files.list(queryPath)) {
            List<Path> allFiles = pathStream
-                    .filter(path -> path.toString().startsWith(datasetPath))
+                    .filter(path -> path.toAbsolutePath().normalize().startsWith(datasetRoot))
+                    .filter(path -> !isInternalDatasetPath(datasetRoot, path))
                    .filter(path -> !excludeDerivedFiles
                            || Files.isDirectory(path)
                            || !derivedFilePaths.contains(normalizeFilePath(path.toString())))
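The replacement above swaps string-prefix checks for Path-based containment: resolve the caller-supplied prefix against the dataset root, normalize away any ".." segments, and only then require that the result still lies under the root. A small standalone sketch of that guard (the paths are made-up examples):

import java.nio.file.Path;
import java.nio.file.Paths;

public class PathGuardSketch {
    // Returns the resolved path, or null when the prefix would escape the dataset root
    static Path resolveInsideRoot(Path datasetRoot, String prefix) {
        Path root = datasetRoot.toAbsolutePath().normalize();
        // normalize() collapses "." and ".." segments before the containment check
        Path candidate = root.resolve(prefix).normalize();
        return candidate.startsWith(root) ? candidate : null;
    }

    public static void main(String[] args) {
        Path root = Paths.get("/dataset/demo");                        // illustrative root
        System.out.println(resolveInsideRoot(root, "images/cat.png")); // /dataset/demo/images/cat.png
        System.out.println(resolveInsideRoot(root, "../other/secret"));// null: rejected
    }
}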
@@ -298,6 +313,86 @@ public class DatasetFileApplicationService {
        }
    }

+    private boolean isSameNormalizedPath(String left, String right) {
+        String normalizedLeft = normalizeFilePath(left);
+        String normalizedRight = normalizeFilePath(right);
+        if (normalizedLeft == null || normalizedRight == null) {
+            return false;
+        }
+        return normalizedLeft.equals(normalizedRight);
+    }
+
+    private boolean isInternalDatasetPath(Path datasetRoot, Path path) {
+        if (datasetRoot == null || path == null) {
+            return false;
+        }
+        try {
+            Path normalizedRoot = datasetRoot.toAbsolutePath().normalize();
+            Path normalizedPath = path.toAbsolutePath().normalize();
+            if (!normalizedPath.startsWith(normalizedRoot)) {
+                return false;
+            }
+            Path relative = normalizedRoot.relativize(normalizedPath);
+            if (relative.getNameCount() == 0) {
+                return false;
+            }
+            return INTERNAL_DIR_NAME.equals(relative.getName(0).toString());
+        } catch (Exception e) {
+            return false;
+        }
+    }
+
+    private String normalizeLogicalPrefix(String prefix) {
+        if (prefix == null) {
+            return "";
+        }
+        String normalized = prefix.trim().replace("\\", "/");
+        while (normalized.startsWith("/")) {
+            normalized = normalized.substring(1);
+        }
+        while (normalized.endsWith("/")) {
+            normalized = normalized.substring(0, normalized.length() - 1);
+        }
+        while (normalized.contains("//")) {
+            normalized = normalized.replace("//", "/");
+        }
+        return normalized;
+    }
+
+    private String normalizeLogicalPath(String logicalPath) {
+        return normalizeLogicalPrefix(logicalPath);
+    }
+
+    private String joinLogicalPath(String prefix, String relativePath) {
+        String normalizedPrefix = normalizeLogicalPrefix(prefix);
+        String normalizedRelative = normalizeLogicalPath(relativePath);
+        if (normalizedPrefix.isEmpty()) {
+            return normalizedRelative;
+        }
+        if (normalizedRelative.isEmpty()) {
+            return normalizedPrefix;
+        }
+        return normalizeLogicalPath(normalizedPrefix + "/" + normalizedRelative);
+    }
+
+    private void assertNotInternalPrefix(String prefix) {
+        if (prefix == null || prefix.isBlank()) {
+            return;
+        }
+        String normalized = normalizeLogicalPrefix(prefix);
+        if (normalized.equals(INTERNAL_DIR_NAME) || normalized.startsWith(INTERNAL_DIR_NAME + "/")) {
+            throw BusinessException.of(CommonErrorCode.PARAM_ERROR);
+        }
+    }
+
+    private boolean isArchivedStatus(DatasetFile datasetFile) {
+        if (datasetFile == null) {
+            return false;
+        }
+        String status = datasetFile.getStatus();
+        return status != null && FILE_STATUS_ARCHIVED.equalsIgnoreCase(status);
+    }
+
    private boolean isSourceDocument(DatasetFile datasetFile) {
        if (datasetFile == null) {
            return false;
@@ -327,6 +422,144 @@ public class DatasetFileApplicationService {
        }
    }

+    private Path resolveDatasetRootPath(Dataset dataset, String datasetId) {
+        String datasetPath = dataset == null ? null : dataset.getPath();
+        if (datasetPath == null || datasetPath.isBlank()) {
+            datasetPath = datasetBasePath + File.separator + datasetId;
+            if (dataset != null) {
+                dataset.setPath(datasetPath);
+                datasetRepository.updateById(dataset);
+            }
+        }
+        Path datasetRoot = Paths.get(datasetPath).toAbsolutePath().normalize();
+        try {
+            Files.createDirectories(datasetRoot);
+        } catch (IOException e) {
+            log.error("Failed to create dataset root dir: {}", datasetRoot, e);
+            throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
+        }
+        return datasetRoot;
+    }
+
+    private Path resolveStagingRootPath(Path datasetRoot,
+                                        DatasetFileUploadCheckInfo checkInfo,
+                                        List<FileUploadResult> uploadedFiles) {
+        if (datasetRoot == null) {
+            return null;
+        }
+        String stagingPath = checkInfo == null ? null : checkInfo.getStagingPath();
+        if (stagingPath != null && !stagingPath.isBlank()) {
+            try {
+                Path stagingRoot = Paths.get(stagingPath).toAbsolutePath().normalize();
+                if (!stagingRoot.startsWith(datasetRoot)) {
+                    log.warn("Staging root out of dataset root, datasetId={}, stagingRoot={}, datasetRoot={}",
+                            checkInfo == null ? null : checkInfo.getDatasetId(), stagingRoot, datasetRoot);
+                    return null;
+                }
+                Path relative = datasetRoot.relativize(stagingRoot);
+                if (relative.getNameCount() < 3) {
+                    return null;
+                }
+                if (!INTERNAL_DIR_NAME.equals(relative.getName(0).toString())
+                        || !INTERNAL_UPLOAD_DIR_NAME.equals(relative.getName(1).toString())) {
+                    return null;
+                }
+                return stagingRoot;
+            } catch (Exception e) {
+                log.warn("Invalid staging path: {}", stagingPath, e);
+                return null;
+            }
+        }
+        if (uploadedFiles == null || uploadedFiles.isEmpty()) {
+            return null;
+        }
+        FileUploadResult firstResult = uploadedFiles.get(0);
+        File firstFile = firstResult == null ? null : firstResult.getSavedFile();
+        if (firstFile == null) {
+            return null;
+        }
+        try {
+            return Paths.get(firstFile.getParent()).toAbsolutePath().normalize();
+        } catch (Exception e) {
+            return null;
+        }
+    }
+
+    private void scheduleCleanupStagingDirAfterCommit(Path stagingRoot) {
+        if (stagingRoot == null) {
+            return;
+        }
+        Runnable cleanup = () -> deleteDirectoryRecursivelyQuietly(stagingRoot);
+        if (TransactionSynchronizationManager.isSynchronizationActive()) {
+            TransactionSynchronizationManager.registerSynchronization(new TransactionSynchronization() {
+                @Override
+                public void afterCommit() {
+                    cleanup.run();
+                }
+            });
+            return;
+        }
+        cleanup.run();
+    }
+
+    private void deleteDirectoryRecursivelyQuietly(Path directory) {
+        if (directory == null) {
+            return;
+        }
+        if (!Files.exists(directory)) {
+            return;
+        }
+        try (Stream<Path> paths = Files.walk(directory)) {
+            paths.sorted(Comparator.reverseOrder()).forEach(path -> {
+                try {
+                    Files.deleteIfExists(path);
+                } catch (IOException e) {
+                    log.debug("Failed to delete: {}", path, e);
+                }
+            });
+        } catch (IOException e) {
+            log.debug("Failed to cleanup staging dir: {}", directory, e);
+        }
+    }
+
+    private String sanitizeArchiveFileName(String fileName) {
+        String input = fileName == null ? "" : fileName.trim();
+        if (input.isBlank()) {
+            return "file";
+        }
+        StringBuilder builder = new StringBuilder(input.length());
+        for (int i = 0; i < input.length(); i++) {
+            char c = input.charAt(i);
+            if (c <= 31 || c == 127) {
+                builder.append('_');
+                continue;
+            }
+            if (c == '/' || c == '\\' || c == ':' || c == '*' || c == '?' || c == '\"'
+                    || c == '<' || c == '>' || c == '|') {
+                builder.append('_');
+                continue;
+            }
+            builder.append(c);
+        }
+        String sanitized = builder.toString().trim();
+        return sanitized.isEmpty() ? "file" : sanitized;
+    }
+
+    private String sha256Hex(String value) {
+        String input = value == null ? "" : value;
+        try {
+            java.security.MessageDigest digest = java.security.MessageDigest.getInstance("SHA-256");
+            byte[] hashed = digest.digest(input.getBytes(java.nio.charset.StandardCharsets.UTF_8));
+            StringBuilder builder = new StringBuilder(hashed.length * 2);
+            for (byte b : hashed) {
+                builder.append(String.format("%02x", b));
+            }
+            return builder.toString();
+        } catch (Exception e) {
+            return Integer.toHexString(input.hashCode());
+        }
+    }
+
    /**
     * Get file details
     */
@@ -349,10 +582,12 @@ public class DatasetFileApplicationService {
    public void deleteDatasetFile(String datasetId, String fileId) {
        DatasetFile file = getDatasetFile(datasetId, fileId);
        Dataset dataset = datasetRepository.getById(datasetId);
-        dataset.setFiles(new ArrayList<>(Collections.singleton(file)));
        datasetFileRepository.removeById(fileId);
+        if (!isArchivedStatus(file)) {
+            dataset.setFiles(new ArrayList<>(Collections.singleton(file)));
            dataset.removeFile(file);
            datasetRepository.updateById(dataset);
+        }
        datasetFilePreviewService.deletePreviewFileQuietly(datasetId, fileId);
        // When a file is deleted, files uploaded into the dataset lose both the database record and the file on disk; collected files only lose the database record
        if (file.getFilePath().startsWith(dataset.getPath())) {
@@ -393,18 +628,26 @@ public class DatasetFileApplicationService {
        if (Objects.isNull(dataset)) {
            throw BusinessException.of(DataManagementErrorCode.DATASET_NOT_FOUND);
        }
-        List<DatasetFile> allByDatasetId = datasetFileRepository.findAllByDatasetId(datasetId);
-        Set<String> filePaths = allByDatasetId.stream().map(DatasetFile::getFilePath).collect(Collectors.toSet());
-        String datasetPath = dataset.getPath();
-        Path downloadPath = Path.of(datasetPath);
+        Path datasetRoot = Paths.get(dataset.getPath()).toAbsolutePath().normalize();
+        Set<Path> filePaths = datasetFileRepository.findAllVisibleByDatasetId(datasetId).stream()
+                .map(DatasetFile::getFilePath)
+                .filter(Objects::nonNull)
+                .map(path -> Paths.get(path).toAbsolutePath().normalize())
+                .filter(path -> path.startsWith(datasetRoot))
+                .filter(path -> !isInternalDatasetPath(datasetRoot, path))
+                .collect(Collectors.toSet());
+        Path downloadPath = datasetRoot;
        response.setContentType("application/zip");
        String zipName = String.format("dataset_%s.zip",
                LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss")));
        response.setHeader(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=" + zipName);
        try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(response.getOutputStream())) {
            try (Stream<Path> pathStream = Files.walk(downloadPath)) {
-                List<Path> allPaths = pathStream.filter(path -> path.toString().startsWith(datasetPath))
-                        .filter(path -> filePaths.stream().anyMatch(filePath -> filePath.startsWith(path.toString())))
+                List<Path> allPaths = pathStream
+                        .map(path -> path.toAbsolutePath().normalize())
+                        .filter(path -> path.startsWith(datasetRoot))
+                        .filter(path -> !isInternalDatasetPath(datasetRoot, path))
+                        .filter(path -> filePaths.stream().anyMatch(filePath -> filePath.startsWith(path)))
                        .toList();
                for (Path path : allPaths) {
                    addToZipFile(path, downloadPath, zos);
@@ -461,29 +704,33 @@ public class DatasetFileApplicationService {
            throw BusinessException.of(DataManagementErrorCode.DATASET_NOT_FOUND);
        }

-        // Build the upload path; append the prefix when one is provided
-        String prefix = Optional.ofNullable(chunkUploadRequest.getPrefix()).orElse("").trim();
-        prefix = prefix.replace("\\", "/");
-        while (prefix.startsWith("/")) {
-            prefix = prefix.substring(1);
-        }
-
-        String uploadPath = dataset.getPath();
-        if (uploadPath == null || uploadPath.isBlank()) {
-            uploadPath = datasetBasePath + File.separator + datasetId;
-        }
-        if (!prefix.isEmpty()) {
-            uploadPath = uploadPath + File.separator + prefix.replace("/", File.separator);
-        }
+        String prefix = normalizeLogicalPrefix(chunkUploadRequest == null ? null : chunkUploadRequest.getPrefix());
+        assertNotInternalPrefix(prefix);
+
+        Path datasetRoot = resolveDatasetRootPath(dataset, datasetId);
+        Path stagingRoot = datasetRoot
+                .resolve(INTERNAL_DIR_NAME)
+                .resolve(INTERNAL_UPLOAD_DIR_NAME)
+                .resolve(UUID.randomUUID().toString())
+                .toAbsolutePath()
+                .normalize();
+        BusinessAssert.isTrue(stagingRoot.startsWith(datasetRoot), CommonErrorCode.PARAM_ERROR);
+        try {
+            Files.createDirectories(stagingRoot);
+        } catch (IOException e) {
+            log.error("Failed to create staging dir: {}", stagingRoot, e);
+            throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
+        }

        ChunkUploadPreRequest request = ChunkUploadPreRequest.builder().build();
-        request.setUploadPath(uploadPath);
+        request.setUploadPath(stagingRoot.toString());
        request.setTotalFileNum(chunkUploadRequest.getTotalFileNum());
        request.setServiceId(DatasetConstant.SERVICE_ID);
        DatasetFileUploadCheckInfo checkInfo = new DatasetFileUploadCheckInfo();
        checkInfo.setDatasetId(datasetId);
        checkInfo.setHasArchive(chunkUploadRequest.isHasArchive());
        checkInfo.setPrefix(prefix);
+        checkInfo.setStagingPath(stagingRoot.toString());
        try {
            ObjectMapper objectMapper = new ObjectMapper();
            String checkInfoJson = objectMapper.writeValueAsString(checkInfo);
@@ -505,6 +752,14 @@ public class DatasetFileApplicationService {
            saveFileInfoToDb(uploadResult, datasetId);
        }
    }
+
+    /**
+     * Cancel an upload
+     */
+    @Transactional
+    public void cancelUpload(String reqId) {
+        fileService.cancelUpload(reqId);
+    }

    private void saveFileInfoToDb(FileUploadResult fileUploadResult, String datasetId) {
        if (Objects.isNull(fileUploadResult.getSavedFile())) {
            // The chunked upload has not finished yet
@@ -527,32 +782,251 @@ public class DatasetFileApplicationService {
        } else {
            files = Collections.singletonList(fileUploadResult);
        }
-        addFileToDataset(datasetId, files);
+        commitUploadedFiles(datasetId, checkInfo, files, fileUploadResult.isAllFilesUploaded());
    }

-    private void addFileToDataset(String datasetId, List<FileUploadResult> unpacked) {
+    private void commitUploadedFiles(String datasetId,
+                                     DatasetFileUploadCheckInfo checkInfo,
+                                     List<FileUploadResult> uploadedFiles,
+                                     boolean cleanupStagingAfterCommit) {
        Dataset dataset = datasetRepository.getById(datasetId);
-        dataset.setFiles(datasetFileRepository.findAllByDatasetId(datasetId));
-        for (FileUploadResult file : unpacked) {
-            File savedFile = file.getSavedFile();
+        BusinessAssert.notNull(dataset, DataManagementErrorCode.DATASET_NOT_FOUND);
+
+        Path datasetRoot = resolveDatasetRootPath(dataset, datasetId);
+        String prefix = checkInfo == null ? "" : normalizeLogicalPrefix(checkInfo.getPrefix());
+        assertNotInternalPrefix(prefix);
+
+        Path stagingRoot = resolveStagingRootPath(datasetRoot, checkInfo, uploadedFiles);
+        BusinessAssert.notNull(stagingRoot, CommonErrorCode.PARAM_ERROR);
+
+        dataset.setFiles(new ArrayList<>(datasetFileRepository.findAllVisibleByDatasetId(datasetId)));
+        for (FileUploadResult fileResult : uploadedFiles) {
+            commitSingleUploadedFile(dataset, datasetRoot, stagingRoot, prefix, fileResult);
+        }
+
+        dataset.active();
+        datasetRepository.updateById(dataset);
+
+        if (cleanupStagingAfterCommit) {
+            scheduleCleanupStagingDirAfterCommit(stagingRoot);
+        }
+    }
+
+    private void commitSingleUploadedFile(Dataset dataset,
+                                          Path datasetRoot,
+                                          Path stagingRoot,
+                                          String prefix,
+                                          FileUploadResult fileResult) {
+        if (dataset == null || fileResult == null || fileResult.getSavedFile() == null) {
+            return;
+        }
+        Path incomingPath = Paths.get(fileResult.getSavedFile().getPath()).toAbsolutePath().normalize();
+        BusinessAssert.isTrue(incomingPath.startsWith(stagingRoot), CommonErrorCode.PARAM_ERROR);
+
+        String relativePath = stagingRoot.relativize(incomingPath).toString().replace(File.separator, "/");
+        String logicalPath = joinLogicalPath(prefix, relativePath);
+        assertNotInternalPrefix(logicalPath);
+
+        commitNewFileVersion(dataset, datasetRoot, logicalPath, incomingPath, true);
+    }
+
+    private DatasetFile commitNewFileVersion(Dataset dataset,
+                                             Path datasetRoot,
+                                             String logicalPath,
+                                             Path incomingFilePath,
+                                             boolean moveIncoming) {
+        BusinessAssert.notNull(dataset, CommonErrorCode.PARAM_ERROR);
+        BusinessAssert.isTrue(datasetRoot != null && Files.exists(datasetRoot), CommonErrorCode.PARAM_ERROR);
+
+        String normalizedLogicalPath = normalizeLogicalPath(logicalPath);
+        BusinessAssert.isTrue(!normalizedLogicalPath.isEmpty(), CommonErrorCode.PARAM_ERROR);
+        assertNotInternalPrefix(normalizedLogicalPath);
+
+        Path targetFilePath = datasetRoot.resolve(normalizedLogicalPath.replace("/", File.separator))
+                .toAbsolutePath()
+                .normalize();
+        BusinessAssert.isTrue(targetFilePath.startsWith(datasetRoot), CommonErrorCode.PARAM_ERROR);
+
+        DuplicateMethod effectiveDuplicateMethod = resolveEffectiveDuplicateMethod();
+        DatasetFile latest = datasetFileRepository.findLatestByDatasetIdAndLogicalPath(dataset.getId(), normalizedLogicalPath);
+        if (latest == null && dataset.getFiles() != null) {
+            latest = dataset.getFiles().stream()
+                    .filter(existing -> isSameNormalizedPath(existing == null ? null : existing.getFilePath(), targetFilePath.toString()))
+                    .findFirst()
+                    .orElse(null);
+        }
+        if (latest != null && effectiveDuplicateMethod == DuplicateMethod.ERROR) {
+            throw BusinessException.of(DataManagementErrorCode.DATASET_FILE_ALREADY_EXISTS);
+        }
+
+        long nextVersion = 1L;
+        if (latest != null) {
+            long latestVersion = Optional.ofNullable(latest.getVersion()).orElse(1L);
+            if (latest.getVersion() == null) {
+                latest.setVersion(latestVersion);
+            }
+            if (latest.getLogicalPath() == null || latest.getLogicalPath().isBlank()) {
+                latest.setLogicalPath(normalizedLogicalPath);
+            }
+            nextVersion = latestVersion + 1L;
+        }
+
+        if (latest != null && effectiveDuplicateMethod == DuplicateMethod.VERSION) {
+            Path archivedPath = archiveDatasetFileVersion(datasetRoot, normalizedLogicalPath, latest);
+            if (archivedPath != null) {
+                latest.setFilePath(archivedPath.toString());
+            } else if (Files.exists(targetFilePath)) {
+                log.error("Failed to archive latest file, refuse to overwrite. datasetId={}, fileId={}, logicalPath={}, targetPath={}",
+                        dataset.getId(), latest.getId(), normalizedLogicalPath, targetFilePath);
+                throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
+            }
+            latest.setStatus(FILE_STATUS_ARCHIVED);
+            datasetFileRepository.updateById(latest);
+            dataset.removeFile(latest);
+        } else if (latest == null && Files.exists(targetFilePath)) {
+            archiveOrphanTargetFile(datasetRoot, normalizedLogicalPath, targetFilePath);
+        }
+
+        try {
+            Files.createDirectories(targetFilePath.getParent());
+            if (moveIncoming) {
+                Files.move(incomingFilePath, targetFilePath, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+            } else {
+                Files.copy(incomingFilePath, targetFilePath, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+            }
+        } catch (IOException e) {
+            log.error("Failed to write dataset file, datasetId={}, logicalPath={}, targetPath={}",
+                    dataset.getId(), normalizedLogicalPath, targetFilePath, e);
+            throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
+        }
+
        LocalDateTime currentTime = LocalDateTime.now();
+        String fileName = targetFilePath.getFileName().toString();
+        long fileSize;
+        try {
+            fileSize = Files.size(targetFilePath);
+        } catch (IOException e) {
+            fileSize = 0L;
+        }
+
        DatasetFile datasetFile = DatasetFile.builder()
                .id(UUID.randomUUID().toString())
-                .datasetId(datasetId)
-                .fileSize(savedFile.length())
+                .datasetId(dataset.getId())
+                .fileName(fileName)
+                .fileType(AnalyzerUtils.getExtension(fileName))
+                .fileSize(fileSize)
+                .filePath(targetFilePath.toString())
+                .logicalPath(normalizedLogicalPath)
+                .version(nextVersion)
+                .status(FILE_STATUS_ACTIVE)
                .uploadTime(currentTime)
                .lastAccessTime(currentTime)
-                .fileName(file.getFileName())
-                .filePath(savedFile.getPath())
-                .fileType(AnalyzerUtils.getExtension(file.getFileName()))
                .build();
-        setDatasetFileId(datasetFile, dataset);
        datasetFileRepository.saveOrUpdate(datasetFile);
        dataset.addFile(datasetFile);
        triggerPdfTextExtraction(dataset, datasetFile);
+        return datasetFile;
+    }
+
+    private DuplicateMethod resolveEffectiveDuplicateMethod() {
+        if (duplicateMethod == null) {
+            return DuplicateMethod.VERSION;
+        }
+        if (duplicateMethod == DuplicateMethod.COVER) {
+            log.warn("duplicateMethod=COVER would overwrite content that annotation fileIds still reference; forcing VERSION instead.");
+            return DuplicateMethod.VERSION;
+        }
+        return duplicateMethod;
+    }
+
+    private Path archiveDatasetFileVersion(Path datasetRoot, String logicalPath, DatasetFile latest) {
+        if (latest == null || latest.getId() == null || latest.getId().isBlank()) {
+            return null;
+        }
+        Path currentPath;
+        try {
+            currentPath = Paths.get(latest.getFilePath()).toAbsolutePath().normalize();
+        } catch (Exception e) {
+            log.warn("Invalid latest file path, skip archiving. datasetId={}, fileId={}, filePath={}",
+                    latest.getDatasetId(), latest.getId(), latest.getFilePath());
+            return null;
+        }
+
+        if (!Files.exists(currentPath) || !Files.isRegularFile(currentPath)) {
+            log.warn("Latest file not found on disk, skip archiving. datasetId={}, fileId={}, filePath={}",
+                    latest.getDatasetId(), latest.getId(), currentPath);
+            return null;
+        }
+        if (!currentPath.startsWith(datasetRoot)) {
+            log.warn("Latest file path out of dataset root, skip archiving. datasetId={}, fileId={}, filePath={}",
+                    latest.getDatasetId(), latest.getId(), currentPath);
+            return null;
+        }
+
+        long latestVersion = Optional.ofNullable(latest.getVersion()).orElse(1L);
+        String logicalPathHash = sha256Hex(logicalPath);
+        Path archiveDir = datasetRoot
+                .resolve(INTERNAL_DIR_NAME)
+                .resolve(INTERNAL_VERSIONS_DIR_NAME)
+                .resolve(logicalPathHash)
+                .resolve("v" + latestVersion)
+                .toAbsolutePath()
+                .normalize();
+        BusinessAssert.isTrue(archiveDir.startsWith(datasetRoot), CommonErrorCode.PARAM_ERROR);
+
+        try {
+            Files.createDirectories(archiveDir);
+        } catch (IOException e) {
+            log.error("Failed to create archive dir: {}", archiveDir, e);
+            throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
+        }
+
+        String fileName = sanitizeArchiveFileName(Optional.ofNullable(latest.getFileName()).orElse(currentPath.getFileName().toString()));
+        Path archivedPath = archiveDir.resolve(latest.getId() + "__" + fileName).toAbsolutePath().normalize();
+        BusinessAssert.isTrue(archivedPath.startsWith(archiveDir), CommonErrorCode.PARAM_ERROR);
+
+        try {
+            Files.move(currentPath, archivedPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+            return archivedPath;
+        } catch (IOException e) {
+            log.error("Failed to archive latest file, datasetId={}, fileId={}, from={}, to={}",
+                    latest.getDatasetId(), latest.getId(), currentPath, archivedPath, e);
+            throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
+        }
+    }
+
+    private void archiveOrphanTargetFile(Path datasetRoot, String logicalPath, Path targetFilePath) {
+        if (datasetRoot == null || targetFilePath == null) {
+            return;
+        }
+        if (!Files.exists(targetFilePath) || !Files.isRegularFile(targetFilePath)) {
+            return;
+        }
+        String logicalPathHash = sha256Hex(logicalPath);
+        Path orphanDir = datasetRoot
+                .resolve(INTERNAL_DIR_NAME)
+                .resolve(INTERNAL_VERSIONS_DIR_NAME)
+                .resolve(logicalPathHash)
+                .resolve("orphan")
+                .toAbsolutePath()
+                .normalize();
+        if (!orphanDir.startsWith(datasetRoot)) {
+            return;
+        }
+        try {
+            Files.createDirectories(orphanDir);
+            String safeName = sanitizeArchiveFileName(targetFilePath.getFileName().toString());
+            Path orphanPath = orphanDir.resolve("orphan_" + System.currentTimeMillis() + "__" + safeName)
+                    .toAbsolutePath()
+                    .normalize();
+            if (!orphanPath.startsWith(orphanDir)) {
+                return;
+            }
+            Files.move(targetFilePath, orphanPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+        } catch (Exception e) {
+            log.warn("Failed to archive orphan target file, logicalPath={}, targetPath={}", logicalPath, targetFilePath, e);
        }
-        dataset.active();
-        datasetRepository.updateById(dataset);
    }
+
    /**
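For orientation, archiveDatasetFileVersion above parks a superseded version under a fixed internal layout: <datasetRoot>/.datamate/versions/<sha256(logicalPath)>/v<version>/<fileId>__<sanitizedFileName>. A small sketch that reproduces only that path computation, with made-up inputs (the directory names mirror the constants introduced in this diff):

import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.MessageDigest;

public class VersionArchiveLayoutSketch {
    static String sha256Hex(String value) throws Exception {
        MessageDigest digest = MessageDigest.getInstance("SHA-256");
        StringBuilder sb = new StringBuilder();
        for (byte b : digest.digest(value.getBytes(StandardCharsets.UTF_8))) {
            sb.append(String.format("%02x", b));
        }
        return sb.toString();
    }

    public static void main(String[] args) throws Exception {
        Path datasetRoot = Paths.get("/dataset/demo"); // illustrative dataset root
        String logicalPath = "docs/report.pdf";        // illustrative logical path
        long version = 3L;                             // version being archived
        String fileId = "f-123";                       // illustrative file id
        String fileName = "report.pdf";

        Path archived = datasetRoot
                .resolve(".datamate")                  // INTERNAL_DIR_NAME
                .resolve("versions")                   // INTERNAL_VERSIONS_DIR_NAME
                .resolve(sha256Hex(logicalPath))
                .resolve("v" + version)
                .resolve(fileId + "__" + fileName);
        System.out.println(archived);                  // prints the archive location for this version
    }
}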
@@ -570,11 +1044,16 @@ public class DatasetFileApplicationService {
        while (parentPrefix.startsWith("/")) {
            parentPrefix = parentPrefix.substring(1);
        }
+        parentPrefix = normalizeLogicalPrefix(parentPrefix);
+        assertNotInternalPrefix(parentPrefix);

        String directoryName = Optional.ofNullable(req.getDirectoryName()).orElse("").trim();
        if (directoryName.isEmpty()) {
            throw BusinessException.of(CommonErrorCode.PARAM_ERROR);
        }
+        if (INTERNAL_DIR_NAME.equals(directoryName)) {
+            throw BusinessException.of(CommonErrorCode.PARAM_ERROR);
+        }
        if (directoryName.contains("..") || directoryName.contains("/") || directoryName.contains("\\")) {
            throw BusinessException.of(CommonErrorCode.PARAM_ERROR);
        }
@@ -616,6 +1095,9 @@ public class DatasetFileApplicationService {
|
|||||||
while (prefix.endsWith("/")) {
|
while (prefix.endsWith("/")) {
|
||||||
prefix = prefix.substring(0, prefix.length() - 1);
|
prefix = prefix.substring(0, prefix.length() - 1);
|
||||||
}
|
}
|
||||||
|
if (prefix.equals(INTERNAL_DIR_NAME) || prefix.startsWith(INTERNAL_DIR_NAME + "/")) {
|
||||||
|
throw BusinessException.of(CommonErrorCode.PARAM_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
Path basePath = Paths.get(datasetPath);
|
Path basePath = Paths.get(datasetPath);
|
||||||
Path targetPath = prefix.isEmpty() ? basePath : basePath.resolve(prefix);
|
Path targetPath = prefix.isEmpty() ? basePath : basePath.resolve(prefix);
|
||||||
@@ -652,6 +1134,7 @@ public class DatasetFileApplicationService {
|
|||||||
private void zipDirectory(Path sourceDir, Path basePath, ZipArchiveOutputStream zipOut) throws IOException {
|
private void zipDirectory(Path sourceDir, Path basePath, ZipArchiveOutputStream zipOut) throws IOException {
|
||||||
try (Stream<Path> paths = Files.walk(sourceDir)) {
|
try (Stream<Path> paths = Files.walk(sourceDir)) {
|
||||||
paths.filter(path -> !Files.isDirectory(path))
|
paths.filter(path -> !Files.isDirectory(path))
|
||||||
|
.filter(path -> !isInternalDatasetPath(basePath.toAbsolutePath().normalize(), path))
|
||||||
.forEach(path -> {
|
.forEach(path -> {
|
||||||
try {
|
try {
|
||||||
Path relativePath = basePath.relativize(path);
|
Path relativePath = basePath.relativize(path);
|
||||||
@@ -690,6 +1173,9 @@ public class DatasetFileApplicationService {
|
|||||||
if (prefix.isEmpty()) {
|
if (prefix.isEmpty()) {
|
||||||
throw BusinessException.of(CommonErrorCode.PARAM_ERROR);
|
throw BusinessException.of(CommonErrorCode.PARAM_ERROR);
|
||||||
}
|
}
|
||||||
|
if (prefix.equals(INTERNAL_DIR_NAME) || prefix.startsWith(INTERNAL_DIR_NAME + "/")) {
|
||||||
|
throw BusinessException.of(CommonErrorCode.PARAM_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
String datasetPath = dataset.getPath();
|
String datasetPath = dataset.getPath();
|
||||||
Path basePath = Paths.get(datasetPath);
|
Path basePath = Paths.get(datasetPath);
|
||||||
@@ -761,28 +1247,6 @@ public class DatasetFileApplicationService {
        }
    }

-   /**
-    * Set the file id for a dataset file.
-    *
-    * @param datasetFile the file whose id should be set
-    * @param dataset     the dataset (including its file list)
-    */
-   private void setDatasetFileId(DatasetFile datasetFile, Dataset dataset) {
-       Map<String, DatasetFile> existDatasetFilMap = dataset.getFiles().stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
-       DatasetFile existDatasetFile = existDatasetFilMap.get(datasetFile.getFilePath());
-       if (Objects.isNull(existDatasetFile)) {
-           return;
-       }
-       if (duplicateMethod == DuplicateMethod.ERROR) {
-           log.error("file {} already exists in dataset {}", datasetFile.getFileName(), datasetFile.getDatasetId());
-           throw BusinessException.of(DataManagementErrorCode.DATASET_FILE_ALREADY_EXISTS);
-       }
-       if (duplicateMethod == DuplicateMethod.COVER) {
-           dataset.removeFile(existDatasetFile);
-           datasetFile.setId(existDatasetFile.getId());
-       }
-   }

    /**
     * Copy files to the dataset directory.
     *
@@ -794,36 +1258,21 @@ public class DatasetFileApplicationService {
    public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) {
        Dataset dataset = datasetRepository.getById(datasetId);
        BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
+       Path datasetRoot = resolveDatasetRootPath(dataset, datasetId);
+       dataset.setFiles(new ArrayList<>(datasetFileRepository.findAllVisibleByDatasetId(datasetId)));
        List<DatasetFile> copiedFiles = new ArrayList<>();
-       List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
-       dataset.setFiles(existDatasetFiles);
        for (String sourceFilePath : req.sourcePaths()) {
-           Path sourcePath = Paths.get(sourceFilePath);
+           Path sourcePath = Paths.get(sourceFilePath).toAbsolutePath().normalize();
            if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {
                log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath);
                continue;
            }
-           String fileName = sourcePath.getFileName().toString();
+           String logicalPath = sourcePath.getFileName().toString();
-           File sourceFile = sourcePath.toFile();
+           DatasetFile datasetFile = commitNewFileVersion(dataset, datasetRoot, logicalPath, sourcePath, false);
-           LocalDateTime currentTime = LocalDateTime.now();
-           DatasetFile datasetFile = DatasetFile.builder()
-                   .id(UUID.randomUUID().toString())
-                   .datasetId(datasetId)
-                   .fileName(fileName)
-                   .fileType(AnalyzerUtils.getExtension(fileName))
-                   .fileSize(sourceFile.length())
-                   .filePath(Paths.get(dataset.getPath(), fileName).toString())
-                   .uploadTime(currentTime)
-                   .lastAccessTime(currentTime)
-                   .build();
-           setDatasetFileId(datasetFile, dataset);
-           dataset.addFile(datasetFile);
            copiedFiles.add(datasetFile);
        }
-       datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
        dataset.active();
        datasetRepository.updateById(dataset);
-       CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
        return copiedFiles;
    }
@@ -839,13 +1288,11 @@ public class DatasetFileApplicationService {
    public List<DatasetFile> copyFilesToDatasetDirWithSourceRoot(String datasetId, Path sourceRoot, List<String> sourcePaths) {
        Dataset dataset = datasetRepository.getById(datasetId);
        BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
+       Path datasetRoot = resolveDatasetRootPath(dataset, datasetId);
        Path normalizedRoot = sourceRoot.toAbsolutePath().normalize();
-       List<DatasetFile> copiedFiles = new ArrayList<>();
-       List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
-       dataset.setFiles(existDatasetFiles);
-       Map<String, DatasetFile> copyTargets = new LinkedHashMap<>();
+       dataset.setFiles(new ArrayList<>(datasetFileRepository.findAllVisibleByDatasetId(datasetId)));
+       List<DatasetFile> copiedFiles = new ArrayList<>();
        for (String sourceFilePath : sourcePaths) {
            if (sourceFilePath == null || sourceFilePath.isBlank()) {
                continue;
@@ -859,86 +1306,16 @@ public class DatasetFileApplicationService {
                log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath);
                continue;
            }

            Path relativePath = normalizedRoot.relativize(sourcePath);
-           String fileName = sourcePath.getFileName().toString();
+           String logicalPath = relativePath.toString().replace("\\", "/");
-           File sourceFile = sourcePath.toFile();
+           DatasetFile datasetFile = commitNewFileVersion(dataset, datasetRoot, logicalPath, sourcePath, false);
-           LocalDateTime currentTime = LocalDateTime.now();
-           Path targetPath = Paths.get(dataset.getPath(), relativePath.toString());
-           DatasetFile datasetFile = DatasetFile.builder()
-                   .id(UUID.randomUUID().toString())
-                   .datasetId(datasetId)
-                   .fileName(fileName)
-                   .fileType(AnalyzerUtils.getExtension(fileName))
-                   .fileSize(sourceFile.length())
-                   .filePath(targetPath.toString())
-                   .uploadTime(currentTime)
-                   .lastAccessTime(currentTime)
-                   .build();
-           setDatasetFileId(datasetFile, dataset);
-           dataset.addFile(datasetFile);
            copiedFiles.add(datasetFile);
-           copyTargets.put(sourceFilePath, datasetFile);
        }
-       if (copiedFiles.isEmpty()) {
-           return copiedFiles;
-       }
-       datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
        dataset.active();
        datasetRepository.updateById(dataset);
-       CompletableFuture.runAsync(() -> copyFilesToDatasetDirWithRelativePath(copyTargets, dataset, normalizedRoot));
        return copiedFiles;
    }

-   private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) {
-       for (String sourcePath : sourcePaths) {
-           Path sourceFilePath = Paths.get(sourcePath);
-           Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString());
-           try {
-               Files.createDirectories(Path.of(dataset.getPath()));
-               Files.copy(sourceFilePath, targetFilePath);
-               DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName(
-                       dataset.getId(),
-                       sourceFilePath.getFileName().toString()
-               );
-               triggerPdfTextExtraction(dataset, datasetFile);
-           } catch (IOException e) {
-               log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
-           }
-       }
-   }

-   private void copyFilesToDatasetDirWithRelativePath(
-           Map<String, DatasetFile> copyTargets,
-           Dataset dataset,
-           Path sourceRoot
-   ) {
-       Path datasetRoot = Paths.get(dataset.getPath()).toAbsolutePath().normalize();
-       Path normalizedRoot = sourceRoot.toAbsolutePath().normalize();
-       for (Map.Entry<String, DatasetFile> entry : copyTargets.entrySet()) {
-           Path sourcePath = Paths.get(entry.getKey()).toAbsolutePath().normalize();
-           if (!sourcePath.startsWith(normalizedRoot)) {
-               log.warn("Source file path is out of root: {}", sourcePath);
-               continue;
-           }
-           Path relativePath = normalizedRoot.relativize(sourcePath);
-           Path targetFilePath = datasetRoot.resolve(relativePath).normalize();
-           if (!targetFilePath.startsWith(datasetRoot)) {
-               log.warn("Target file path is out of dataset path: {}", targetFilePath);
-               continue;
-           }
-           try {
-               Files.createDirectories(targetFilePath.getParent());
-               Files.copy(sourcePath, targetFilePath);
-               triggerPdfTextExtraction(dataset, entry.getValue());
-           } catch (IOException e) {
-               log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
-           }
-       }
-   }

    /**
     * Add files to a dataset (only creates database records; performs no file system operations).
     *
@@ -951,8 +1328,7 @@ public class DatasetFileApplicationService {
        Dataset dataset = datasetRepository.getById(datasetId);
        BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
        List<DatasetFile> addedFiles = new ArrayList<>();
-       List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
-       dataset.setFiles(existDatasetFiles);
+       dataset.setFiles(new ArrayList<>(datasetFileRepository.findAllVisibleByDatasetId(datasetId)));
        boolean softAdd = req.softAdd();
        String metadata;
@@ -969,8 +1345,43 @@ public class DatasetFileApplicationService {
        Path sourcePath = Paths.get(sourceFilePath);
        String fileName = sourcePath.getFileName().toString();
        File sourceFile = sourcePath.toFile();
-       LocalDateTime currentTime = LocalDateTime.now();
+       String logicalPath = normalizeLogicalPath(fileName);
+       assertNotInternalPrefix(logicalPath);

+       DatasetFile latest = datasetFileRepository.findLatestByDatasetIdAndLogicalPath(datasetId, logicalPath);
+       if (latest == null && dataset.getFiles() != null) {
+           latest = dataset.getFiles().stream()
+                   .filter(existing -> existing != null
+                           && !isArchivedStatus(existing)
+                           && Objects.equals(existing.getFileName(), fileName))
+                   .findFirst()
+                   .orElse(null);
+       }

+       DuplicateMethod effectiveDuplicateMethod = resolveEffectiveDuplicateMethod();
+       if (latest != null && effectiveDuplicateMethod == DuplicateMethod.ERROR) {
+           throw BusinessException.of(DataManagementErrorCode.DATASET_FILE_ALREADY_EXISTS);
+       }

+       long nextVersion = 1L;
+       if (latest != null) {
+           long latestVersion = Optional.ofNullable(latest.getVersion()).orElse(1L);
+           if (latest.getVersion() == null) {
+               latest.setVersion(latestVersion);
+           }
+           if (latest.getLogicalPath() == null || latest.getLogicalPath().isBlank()) {
+               latest.setLogicalPath(logicalPath);
+           }
+           nextVersion = latestVersion + 1L;
+       }

+       if (latest != null && effectiveDuplicateMethod == DuplicateMethod.VERSION) {
+           latest.setStatus(FILE_STATUS_ARCHIVED);
+           datasetFileRepository.updateById(latest);
+           dataset.removeFile(latest);
+       }

+       LocalDateTime currentTime = LocalDateTime.now();
        DatasetFile datasetFile = DatasetFile.builder()
                .id(UUID.randomUUID().toString())
                .datasetId(datasetId)
@@ -978,16 +1389,19 @@ public class DatasetFileApplicationService {
                .fileType(AnalyzerUtils.getExtension(fileName))
                .fileSize(sourceFile.length())
                .filePath(sourceFilePath)
+               .logicalPath(logicalPath)
+               .version(nextVersion)
+               .status(FILE_STATUS_ACTIVE)
                .uploadTime(currentTime)
                .lastAccessTime(currentTime)
                .metadata(metadata)
                .build();
-       setDatasetFileId(datasetFile, dataset);
+       datasetFileRepository.saveOrUpdate(datasetFile);
        dataset.addFile(datasetFile);
        addedFiles.add(datasetFile);
        triggerPdfTextExtraction(dataset, datasetFile);
        }
-       datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
        dataset.active();
        datasetRepository.updateById(dataset);
        // Note: addFilesToDataset only creates DB records, no file system operations
@@ -178,6 +178,9 @@ public class KnowledgeItemApplicationService {
        if (request.getContentType() != null) {
            knowledgeItem.setContentType(request.getContentType());
        }
+       if (request.getMetadata() != null) {
+           knowledgeItem.setMetadata(request.getMetadata());
+       }
        knowledgeItemRepository.updateById(knowledgeItem);
        return knowledgeItem;
@@ -7,5 +7,6 @@ package com.datamate.datamanagement.common.enums;
 */
public enum DuplicateMethod {
    ERROR,
-   COVER
+   COVER,
+   VERSION
}
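Read together with the duplicate handling in DatasetFileApplicationService above, the three strategies boil down to the behaviour sketched below. This is an illustrative, self-contained summary, not the service's actual resolver; the describe helper exists only for the example.

class DuplicateMethodSketch {
    enum DuplicateMethod { ERROR, COVER, VERSION }

    // Purely illustrative helper summarising what each strategy does to a
    // file whose logical path already exists in the dataset.
    static String describe(DuplicateMethod method) {
        switch (method) {
            case ERROR:
                return "reject the new file (DATASET_FILE_ALREADY_EXISTS)";
            case COVER:
                return "overwrite the existing record in place, reusing its id";
            case VERSION:
                return "archive the current record and create version n+1";
            default:
                return "unknown";
        }
    }

    public static void main(String[] args) {
        for (DuplicateMethod m : DuplicateMethod.values()) {
            System.out.println(m + " -> " + describe(m));
        }
    }
}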
@@ -152,12 +152,20 @@ public class Dataset extends BaseEntity<String> {
    }

    public void removeFile(DatasetFile file) {
-       if (this.files.remove(file)) {
+       if (file == null) {
+           return;
+       }
+       boolean removed = this.files.remove(file);
+       if (!removed && file.getId() != null) {
+           removed = this.files.removeIf(existing -> Objects.equals(existing.getId(), file.getId()));
+       }
+       if (!removed) {
+           return;
+       }
        this.fileCount = Math.max(0, this.fileCount - 1);
        this.sizeBytes = Math.max(0, this.sizeBytes - (file.getFileSize() != null ? file.getFileSize() : 0L));
        this.updatedAt = LocalDateTime.now();
-       }
    }

    public void active() {
        if (this.status == DatasetStatusType.DRAFT) {
@@ -28,12 +28,16 @@ public class DatasetFile {
    private String datasetId; // UUID
    private String fileName;
    private String filePath;
+   /** Logical path of the file (relative to the dataset root, may include subdirectories) */
+   private String logicalPath;
+   /** File version number (incremented per logicalPath) */
+   private Long version;
    private String fileType; // JPG/PNG/DCM/TXT
    private Long fileSize; // bytes
    private String checkSum;
    private String tags;
    private String metadata;
-   private String status; // UPLOADED, PROCESSING, COMPLETED, ERROR
+   private String status; // ACTIVE/ARCHIVED/DELETED/PROCESSING...
    private LocalDateTime uploadTime;
    private LocalDateTime lastAccessTime;
    private LocalDateTime createdAt;
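To make the intent of the two new columns concrete, here is a small illustrative snippet. It assumes the Lombok builder generated for the entity above; all ids and paths are made up. The same logicalPath is shared by consecutive versions, the superseded row is flipped to ARCHIVED and its bytes live under the dataset's internal versions directory, and only the newest row stays ACTIVE at the real location.

// Illustrative only: two records describing the same logical file.
DatasetFile v1 = DatasetFile.builder()
        .id("f-1").datasetId("ds-1")
        .fileName("a.txt").logicalPath("docs/a.txt")
        .version(1L).status("ARCHIVED")
        .filePath("/data/ds-1/.datamate/versions/<sha256-of-logical-path>/v1/f-1__a.txt")
        .build();
DatasetFile v2 = DatasetFile.builder()
        .id("f-2").datasetId("ds-1")
        .fileName("a.txt").logicalPath("docs/a.txt")
        .version(2L).status("ACTIVE")
        .filePath("/data/ds-1/docs/a.txt")
        .build();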
@@ -21,4 +21,7 @@ public class DatasetFileUploadCheckInfo {
    /** Target subdirectory prefix, e.g. "images/"; empty means the dataset root */
    private String prefix;

+   /** Temporary directory where uploads are staged on disk (server-side only, not exposed externally) */
+   private String stagingPath;
}
@@ -24,8 +24,19 @@ public interface DatasetFileRepository extends IRepository<DatasetFile> {

    List<DatasetFile> findAllByDatasetId(String datasetId);

+   /**
+    * Query the "visible" files of a dataset (historical archived versions are excluded by default).
+    * Convention: a NULL status counts as visible; status = ARCHIVED marks a historical version.
+    */
+   List<DatasetFile> findAllVisibleByDatasetId(String datasetId);

    DatasetFile findByDatasetIdAndFileName(String datasetId, String fileName);

+   /**
+    * Query the latest version (ACTIVE/NULL) for the given logical path.
+    */
+   DatasetFile findLatestByDatasetIdAndLogicalPath(String datasetId, String logicalPath);

    IPage<DatasetFile> findByCriteria(String datasetId, String fileType, String status, String name,
                                      Boolean hasAnnotation, IPage<DatasetFile> page);
@@ -25,6 +25,8 @@ public class DatasetFileRepositoryImpl extends CrudRepository<DatasetFileMapper,
    private final DatasetFileMapper datasetFileMapper;
    private static final String ANNOTATION_EXISTS_SQL =
            "SELECT 1 FROM t_dm_annotation_results ar WHERE ar.file_id = t_dm_dataset_files.id";
+   private static final String FILE_STATUS_ARCHIVED = "ARCHIVED";
+   private static final String FILE_STATUS_ACTIVE = "ACTIVE";

    @Override
    public Long countByDatasetId(String datasetId) {
@@ -51,19 +53,54 @@ public class DatasetFileRepositoryImpl extends CrudRepository<DatasetFileMapper,
        return datasetFileMapper.findAllByDatasetId(datasetId);
    }

+   @Override
+   public List<DatasetFile> findAllVisibleByDatasetId(String datasetId) {
+       return datasetFileMapper.selectList(new LambdaQueryWrapper<DatasetFile>()
+               .eq(DatasetFile::getDatasetId, datasetId)
+               .and(wrapper -> wrapper.isNull(DatasetFile::getStatus)
+                       .or()
+                       .ne(DatasetFile::getStatus, FILE_STATUS_ARCHIVED))
+               .orderByDesc(DatasetFile::getUploadTime));
+   }

    @Override
    public DatasetFile findByDatasetIdAndFileName(String datasetId, String fileName) {
        return datasetFileMapper.findByDatasetIdAndFileName(datasetId, fileName);
    }

+   @Override
+   public DatasetFile findLatestByDatasetIdAndLogicalPath(String datasetId, String logicalPath) {
+       if (!StringUtils.hasText(datasetId) || !StringUtils.hasText(logicalPath)) {
+           return null;
+       }
+       return datasetFileMapper.selectOne(new LambdaQueryWrapper<DatasetFile>()
+               .eq(DatasetFile::getDatasetId, datasetId)
+               .eq(DatasetFile::getLogicalPath, logicalPath)
+               .and(wrapper -> wrapper.isNull(DatasetFile::getStatus)
+                       .or()
+                       .eq(DatasetFile::getStatus, FILE_STATUS_ACTIVE))
+               .orderByDesc(DatasetFile::getVersion)
+               .orderByDesc(DatasetFile::getUploadTime)
+               .last("LIMIT 1"));
+   }

    public IPage<DatasetFile> findByCriteria(String datasetId, String fileType, String status, String name,
                                             Boolean hasAnnotation, IPage<DatasetFile> page) {
-       return datasetFileMapper.selectPage(page, new LambdaQueryWrapper<DatasetFile>()
+       LambdaQueryWrapper<DatasetFile> wrapper = new LambdaQueryWrapper<DatasetFile>()
                .eq(DatasetFile::getDatasetId, datasetId)
                .eq(StringUtils.hasText(fileType), DatasetFile::getFileType, fileType)
-               .eq(StringUtils.hasText(status), DatasetFile::getStatus, status)
                .like(StringUtils.hasText(name), DatasetFile::getFileName, name)
-               .exists(Boolean.TRUE.equals(hasAnnotation), ANNOTATION_EXISTS_SQL));
+               .exists(Boolean.TRUE.equals(hasAnnotation), ANNOTATION_EXISTS_SQL);

+       if (StringUtils.hasText(status)) {
+           wrapper.eq(DatasetFile::getStatus, status);
+       } else {
+           wrapper.and(visibility -> visibility.isNull(DatasetFile::getStatus)
+                   .or()
+                   .ne(DatasetFile::getStatus, FILE_STATUS_ARCHIVED));
+       }

+       return datasetFileMapper.selectPage(page, wrapper);
    }

    @Override
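For readers less familiar with MyBatis-Plus lambda wrappers, the two new queries correspond roughly to the SQL sketched in the comments below, and a caller such as the application service above can derive the next version number from the result. The ids and logical path are made up.

// Approximate SQL behind findAllVisibleByDatasetId (MyBatis-Plus builds the exact statement):
//   SELECT ... FROM t_dm_dataset_files
//   WHERE dataset_id = ? AND (status IS NULL OR status <> 'ARCHIVED')
//   ORDER BY upload_time DESC
// Approximate SQL behind findLatestByDatasetIdAndLogicalPath:
//   SELECT ... FROM t_dm_dataset_files
//   WHERE dataset_id = ? AND logical_path = ?
//     AND (status IS NULL OR status = 'ACTIVE')
//   ORDER BY version DESC, upload_time DESC LIMIT 1
DatasetFile latest = datasetFileRepository.findLatestByDatasetIdAndLogicalPath("ds-1", "docs/a.txt");
long nextVersion = (latest == null || latest.getVersion() == null) ? 1L : latest.getVersion() + 1L;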
@@ -34,4 +34,8 @@ public class CreateKnowledgeItemRequest {
     * Source file ID (used for scenarios such as annotation sync)
     */
    private String sourceFileId;
+   /**
+    * Extended metadata
+    */
+   private String metadata;
}
@@ -1,8 +1,10 @@
package com.datamate.datamanagement.interfaces.dto;

import com.datamate.datamanagement.common.enums.DatasetStatusType;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.Size;
+import lombok.AccessLevel;
import lombok.Getter;
import lombok.Setter;
@@ -24,9 +26,18 @@ public class UpdateDatasetRequest {
    /** Collection task id */
    private String dataSource;
    /** Parent dataset ID */
+   @Setter(AccessLevel.NONE)
    private String parentDatasetId;
+   @JsonIgnore
+   @Setter(AccessLevel.NONE)
+   private boolean parentDatasetIdProvided;
    /** Tag list */
    private List<String> tags;
    /** Dataset status */
    private DatasetStatusType status;

+   public void setParentDatasetId(String parentDatasetId) {
+       this.parentDatasetIdProvided = true;
+       this.parentDatasetId = parentDatasetId;
+   }
}
@@ -18,4 +18,8 @@ public class UpdateKnowledgeItemRequest {
     * Content type
     */
    private KnowledgeContentType contentType;
+   /**
+    * Extended metadata
+    */
+   private String metadata;
}
@@ -0,0 +1,33 @@
+package com.datamate.datamanagement.interfaces.rest;
+
+import com.datamate.datamanagement.application.DatasetFileApplicationService;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.PathVariable;
+import org.springframework.web.bind.annotation.PutMapping;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RestController;
+
+/**
+ * Dataset upload controller
+ */
+@Slf4j
+@RestController
+@RequiredArgsConstructor
+@RequestMapping("/data-management/datasets/upload")
+public class DatasetUploadController {
+
+    private final DatasetFileApplicationService datasetFileApplicationService;
+
+    /**
+     * Cancel an upload
+     *
+     * @param reqId pre-upload request ID
+     */
+    @PutMapping("/cancel-upload/{reqId}")
+    public ResponseEntity<Void> cancelUpload(@PathVariable("reqId") String reqId) {
+        datasetFileApplicationService.cancelUpload(reqId);
+        return ResponseEntity.ok().build();
+    }
+}
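A hedged sketch of how the new endpoint could be exercised in a web-slice test, assuming Spring's MockMvc test support is available on the test classpath; the class name and request id are made up for illustration.

import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.autoconfigure.web.servlet.WebMvcTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.test.web.servlet.MockMvc;
import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
import org.springframework.test.web.servlet.result.MockMvcResultMatchers;

@WebMvcTest(DatasetUploadController.class)
class DatasetUploadControllerSketchTest {

    @Autowired
    MockMvc mockMvc;

    @MockBean
    DatasetFileApplicationService datasetFileApplicationService;

    @Test
    void cancelUploadReturnsOk() throws Exception {
        // PUT /data-management/datasets/upload/cancel-upload/{reqId}
        mockMvc.perform(MockMvcRequestBuilders.put("/data-management/datasets/upload/cancel-upload/req-123"))
                .andExpect(MockMvcResultMatchers.status().isOk());
    }
}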
@@ -3,7 +3,7 @@
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.datamate.datamanagement.infrastructure.persistence.mapper.DatasetFileMapper">
    <sql id="Base_Column_List">
-       id, dataset_id, file_name, file_path, file_type, file_size, check_sum, tags, metadata, status,
+       id, dataset_id, file_name, file_path, logical_path, version, file_type, file_size, check_sum, tags, metadata, status,
        upload_time, last_access_time, created_at, updated_at
    </sql>

@@ -39,13 +39,17 @@
    </select>

    <select id="countByDatasetId" parameterType="string" resultType="long">
-       SELECT COUNT(*) FROM t_dm_dataset_files WHERE dataset_id = #{datasetId}
+       SELECT COUNT(*)
+       FROM t_dm_dataset_files
+       WHERE dataset_id = #{datasetId}
+         AND (status IS NULL OR status <> 'ARCHIVED')
    </select>

    <select id="countNonDerivedByDatasetId" parameterType="string" resultType="long">
        SELECT COUNT(*)
        FROM t_dm_dataset_files
        WHERE dataset_id = #{datasetId}
+         AND (status IS NULL OR status <> 'ARCHIVED')
          AND (metadata IS NULL OR JSON_EXTRACT(metadata, '$.derived_from_file_id') IS NULL)
    </select>
@@ -54,13 +58,19 @@
    </select>

    <select id="sumSizeByDatasetId" parameterType="string" resultType="long">
-       SELECT COALESCE(SUM(file_size), 0) FROM t_dm_dataset_files WHERE dataset_id = #{datasetId}
+       SELECT COALESCE(SUM(file_size), 0)
+       FROM t_dm_dataset_files
+       WHERE dataset_id = #{datasetId}
+         AND (status IS NULL OR status <> 'ARCHIVED')
    </select>

    <select id="findByDatasetIdAndFileName" resultType="com.datamate.datamanagement.domain.model.dataset.DatasetFile">
        SELECT <include refid="Base_Column_List"/>
        FROM t_dm_dataset_files
-       WHERE dataset_id = #{datasetId} AND file_name = #{fileName}
+       WHERE dataset_id = #{datasetId}
+         AND file_name = #{fileName}
+         AND (status IS NULL OR status <> 'ARCHIVED')
+       ORDER BY version DESC, upload_time DESC
        LIMIT 1
    </select>
@@ -91,6 +101,8 @@
        UPDATE t_dm_dataset_files
        SET file_name = #{fileName},
            file_path = #{filePath},
+           logical_path = #{logicalPath},
+           version = #{version},
            file_type = #{fileType},
            file_size = #{fileSize},
            upload_time = #{uploadTime},
@@ -126,6 +138,7 @@
        <foreach collection="datasetIds" item="datasetId" open="(" separator="," close=")">
            #{datasetId}
        </foreach>
+         AND (status IS NULL OR status <> 'ARCHIVED')
          AND (metadata IS NULL OR JSON_EXTRACT(metadata, '$.derived_from_file_id') IS NULL)
        GROUP BY dataset_id
    </select>
@@ -0,0 +1,147 @@
+package com.datamate.datamanagement.application;
+
+import com.datamate.common.domain.service.FileService;
+import com.datamate.datamanagement.domain.model.dataset.Dataset;
+import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
+import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
+import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.junit.jupiter.api.io.TempDir;
+import org.mockito.ArgumentCaptor;
+import org.mockito.Mock;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.security.MessageDigest;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.mockito.ArgumentMatchers.anyString;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+@ExtendWith(MockitoExtension.class)
+class DatasetFileApplicationServiceVersioningTest {
+
+    @TempDir
+    Path tempDir;
+
+    @Mock
+    DatasetFileRepository datasetFileRepository;
+
+    @Mock
+    DatasetRepository datasetRepository;
+
+    @Mock
+    FileService fileService;
+
+    @Mock
+    PdfTextExtractAsyncService pdfTextExtractAsyncService;
+
+    @Mock
+    DatasetFilePreviewService datasetFilePreviewService;
+
+    @Test
+    void copyFilesToDatasetDirWithSourceRoot_shouldArchiveOldFileAndCreateNewVersionWhenDuplicateLogicalPath()
+            throws Exception {
+        String datasetId = "dataset-1";
+
+        Path datasetRoot = tempDir.resolve("dataset-root");
+        Files.createDirectories(datasetRoot);
+
+        Path sourceRoot = tempDir.resolve("source-root");
+        Files.createDirectories(sourceRoot);
+
+        Path existingPath = datasetRoot.resolve("a.txt");
+        Files.writeString(existingPath, "old-content", StandardCharsets.UTF_8);
+
+        Path incomingPath = sourceRoot.resolve("a.txt");
+        Files.writeString(incomingPath, "new-content", StandardCharsets.UTF_8);
+
+        Dataset dataset = new Dataset();
+        dataset.setId(datasetId);
+        dataset.setPath(datasetRoot.toString());
+
+        DatasetFile oldRecord = DatasetFile.builder()
+                .id("old-file-id")
+                .datasetId(datasetId)
+                .fileName("a.txt")
+                .filePath(existingPath.toString())
+                .logicalPath(null)
+                .version(null)
+                .status(null)
+                .fileSize(Files.size(existingPath))
+                .build();
+
+        when(datasetRepository.getById(datasetId)).thenReturn(dataset);
+        when(datasetFileRepository.findAllVisibleByDatasetId(datasetId)).thenReturn(List.of(oldRecord));
+        when(datasetFileRepository.findLatestByDatasetIdAndLogicalPath(anyString(), anyString())).thenReturn(null);
+
+        DatasetFileApplicationService service = new DatasetFileApplicationService(
+                datasetFileRepository,
+                datasetRepository,
+                fileService,
+                pdfTextExtractAsyncService,
+                datasetFilePreviewService
+        );
+
+        List<DatasetFile> copied = service.copyFilesToDatasetDirWithSourceRoot(
+                datasetId,
+                sourceRoot,
+                List.of(incomingPath.toString())
+        );
+
+        assertThat(copied).hasSize(1);
+        assertThat(Files.readString(existingPath, StandardCharsets.UTF_8)).isEqualTo("new-content");
+
+        String logicalPathHash = sha256Hex("a.txt");
+        Path archivedPath = datasetRoot
+                .resolve(".datamate")
+                .resolve("versions")
+                .resolve(logicalPathHash)
+                .resolve("v1")
+                .resolve("old-file-id__a.txt")
+                .toAbsolutePath()
+                .normalize();
+
+        assertThat(Files.exists(archivedPath)).isTrue();
+        assertThat(Files.readString(archivedPath, StandardCharsets.UTF_8)).isEqualTo("old-content");
+
+        ArgumentCaptor<DatasetFile> archivedCaptor = ArgumentCaptor.forClass(DatasetFile.class);
+        verify(datasetFileRepository).updateById(archivedCaptor.capture());
+        DatasetFile archivedRecord = archivedCaptor.getValue();
+        assertThat(archivedRecord.getId()).isEqualTo("old-file-id");
+        assertThat(archivedRecord.getStatus()).isEqualTo("ARCHIVED");
+        assertThat(archivedRecord.getLogicalPath()).isEqualTo("a.txt");
+        assertThat(archivedRecord.getVersion()).isEqualTo(1L);
+        assertThat(Paths.get(archivedRecord.getFilePath()).toAbsolutePath().normalize()).isEqualTo(archivedPath);
+
+        ArgumentCaptor<DatasetFile> createdCaptor = ArgumentCaptor.forClass(DatasetFile.class);
+        verify(datasetFileRepository).saveOrUpdate(createdCaptor.capture());
+        DatasetFile newRecord = createdCaptor.getValue();
+        assertThat(newRecord.getId()).isNotEqualTo("old-file-id");
+        assertThat(newRecord.getStatus()).isEqualTo("ACTIVE");
+        assertThat(newRecord.getLogicalPath()).isEqualTo("a.txt");
+        assertThat(newRecord.getVersion()).isEqualTo(2L);
+        assertThat(Paths.get(newRecord.getFilePath()).toAbsolutePath().normalize()).isEqualTo(existingPath.toAbsolutePath().normalize());
+    }
+
+    private static String sha256Hex(String value) {
+        try {
+            MessageDigest digest = MessageDigest.getInstance("SHA-256");
+            byte[] hashed = digest.digest((value == null ? "" : value).getBytes(StandardCharsets.UTF_8));
+            StringBuilder builder = new StringBuilder(hashed.length * 2);
+            for (byte b : hashed) {
+                builder.append(String.format("%02x", b));
+            }
+            return builder.toString();
+        } catch (Exception e) {
+            return Integer.toHexString((value == null ? "" : value).hashCode());
+        }
+    }
+}
@@ -74,6 +74,26 @@ public class FileService {
                .build();
    }

+   /**
+    * Cancel an upload
+    */
+   @Transactional
+   public void cancelUpload(String reqId) {
+       if (reqId == null || reqId.isBlank()) {
+           throw BusinessException.of(CommonErrorCode.PARAM_ERROR);
+       }
+       ChunkUploadPreRequest preRequest = chunkUploadRequestMapper.findById(reqId);
+       if (preRequest == null) {
+           return;
+       }
+       String uploadPath = preRequest.getUploadPath();
+       if (uploadPath != null && !uploadPath.isBlank()) {
+           File tempDir = new File(uploadPath, String.format(ChunksSaver.TEMP_DIR_NAME_FORMAT, preRequest.getId()));
+           ChunksSaver.deleteFolder(tempDir.getPath());
+       }
+       chunkUploadRequestMapper.deleteById(reqId);
+   }

    private File uploadFile(ChunkUploadRequest fileUploadRequest, ChunkUploadPreRequest preRequest) {
        File savedFile = ChunksSaver.saveFile(fileUploadRequest, preRequest);
        preRequest.setTimeout(LocalDateTime.now().plusSeconds(DEFAULT_TIMEOUT));
@@ -143,7 +143,20 @@ public class ArchiveAnalyzer {
    private static Optional<FileUploadResult> extractEntity(ArchiveInputStream<?> archiveInputStream, ArchiveEntry archiveEntry, Path archivePath)
            throws IOException {
        byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
-       Path path = Paths.get(archivePath.getParent().toString(), archiveEntry.getName());
+       Path archiveRoot = archivePath.getParent().toAbsolutePath().normalize();
+       String entryName = archiveEntry.getName();
+       if (entryName == null || entryName.isBlank()) {
+           return Optional.empty();
+       }
+       entryName = entryName.replace("\\", "/");
+       while (entryName.startsWith("/")) {
+           entryName = entryName.substring(1);
+       }
+       Path path = archiveRoot.resolve(entryName).normalize();
+       if (!path.startsWith(archiveRoot)) {
+           log.warn("Skip unsafe archive entry path traversal: {}", archiveEntry.getName());
+           return Optional.empty();
+       }
        File file = path.toFile();
        long fileSize = 0L;
        FileUtils.createParentDirectories(file);
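The added lines are a classic zip-slip guard: a crafted entry name can no longer escape the extraction root. A minimal standalone sketch of the same check (paths are made up):

import java.nio.file.Path;
import java.nio.file.Paths;

class ZipSlipCheckSketch {
    // Mirrors the normalize/startsWith check added above.
    static boolean isSafe(Path archiveRoot, String entryName) {
        String name = entryName.replace("\\", "/");
        while (name.startsWith("/")) {
            name = name.substring(1);
        }
        Path resolved = archiveRoot.resolve(name).normalize();
        return resolved.startsWith(archiveRoot);
    }

    public static void main(String[] args) {
        Path root = Paths.get("/tmp/extract").toAbsolutePath().normalize();
        System.out.println(isSafe(root, "data/ok.txt"));       // true
        System.out.println(isSafe(root, "../../etc/passwd"));  // false, would have escaped the root
    }
}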
@@ -13,7 +13,10 @@ public class CommonUtils {
     * @return the file name (with extension)
     */
    public static String trimFilePath(String filePath) {
-       int lastSlashIndex = filePath.lastIndexOf(File.separator);
+       if (filePath == null || filePath.isBlank()) {
+           return "";
+       }
+       int lastSlashIndex = Math.max(filePath.lastIndexOf('/'), filePath.lastIndexOf('\\'));
        String filename = filePath;
        if (lastSlashIndex != -1) {
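A quick sanity check of the fixed helper; the paths are made up and the expected results are shown in comments.

// Expected behaviour after the fix, for both separator styles:
System.out.println(CommonUtils.trimFilePath("/data/upload/report.pdf"));       // report.pdf
System.out.println(CommonUtils.trimFilePath("C:\\data\\upload\\report.pdf"));  // report.pdf
System.out.println(CommonUtils.trimFilePath(null));                            // "" (previously a NullPointerException)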
@@ -5,7 +5,7 @@ server {
    access_log /var/log/datamate/frontend/access.log main;
    error_log /var/log/datamate/frontend/error.log notice;

-   client_max_body_size 1024M;
+   client_max_body_size 0;

    add_header Set-Cookie "NEXT_LOCALE=zh";
@@ -1,17 +1,17 @@
import { Button, Input, Popover, theme, Tag, Empty } from "antd";
import { PlusOutlined } from "@ant-design/icons";
-import { useEffect, useMemo, useState } from "react";
+import { useCallback, useEffect, useMemo, useState } from "react";

interface Tag {
-  id: number;
+  id?: string | number;
  name: string;
-  color: string;
+  color?: string;
}

interface AddTagPopoverProps {
  tags: Tag[];
  onFetchTags?: () => Promise<Tag[]>;
-  onAddTag?: (tag: Tag) => void;
+  onAddTag?: (tagName: string) => void;
  onCreateAndTag?: (tagName: string) => void;
}
@@ -27,20 +27,23 @@ export default function AddTagPopover({
  const [newTag, setNewTag] = useState("");
  const [allTags, setAllTags] = useState<Tag[]>([]);

-  const tagsSet = useMemo(() => new Set(tags.map((tag) => tag.id)), [tags]);
+  const tagsSet = useMemo(
+    () => new Set(tags.map((tag) => (tag.id ?? tag.name))),
+    [tags]
+  );

-  const fetchTags = async () => {
+  const fetchTags = useCallback(async () => {
    if (onFetchTags && showPopover) {
      const data = await onFetchTags?.();
      setAllTags(data || []);
    }
-  };
+  }, [onFetchTags, showPopover]);
  useEffect(() => {
    fetchTags();
-  }, [showPopover]);
+  }, [fetchTags]);

  const availableTags = useMemo(() => {
-    return allTags.filter((tag) => !tagsSet.has(tag.id));
+    return allTags.filter((tag) => !tagsSet.has(tag.id ?? tag.name));
  }, [allTags, tagsSet]);

  const handleCreateAndAddTag = () => {
@@ -24,21 +24,28 @@ interface OperationItem {

interface TagConfig {
  showAdd: boolean;
-  tags: { id: number; name: string; color: string }[];
+  tags: { id?: string | number; name: string; color?: string }[];
-  onFetchTags?: () => Promise<{
-    data: { id: number; name: string; color: string }[];
-  }>;
-  onAddTag?: (tag: { id: number; name: string; color: string }) => void;
+  onFetchTags?: () => Promise<{ id?: string | number; name: string; color?: string }[]>;
+  onAddTag?: (tagName: string) => void;
  onCreateAndTag?: (tagName: string) => void;
}
-interface DetailHeaderProps<T> {
+interface DetailHeaderData {
+  name?: string;
+  description?: string;
+  status?: { color?: string; icon?: React.ReactNode; label?: string };
+  tags?: { id?: string | number; name?: string }[];
+  icon?: React.ReactNode;
+  iconColor?: string;
+}
+
+interface DetailHeaderProps<T extends DetailHeaderData> {
  data: T;
  statistics: StatisticItem[];
  operations: OperationItem[];
  tagConfig?: TagConfig;
}

-function DetailHeader<T>({
+function DetailHeader<T extends DetailHeaderData>({
  data = {} as T,
  statistics,
  operations,
@@ -50,13 +57,13 @@ function DetailHeader<T>({
      <div className="flex items-start gap-4 flex-1">
        <div
          className={`w-16 h-16 text-white rounded-lg flex-center shadow-lg ${
-            (data as any)?.iconColor
+            data?.iconColor
              ? ""
              : "bg-gradient-to-br from-sky-300 to-blue-500 text-white"
          }`}
-          style={(data as any)?.iconColor ? { backgroundColor: (data as any).iconColor } : undefined}
+          style={data?.iconColor ? { backgroundColor: data.iconColor } : undefined}
        >
-          {<div className="w-[2.8rem] h-[2.8rem] text-gray-50">{(data as any)?.icon}</div> || (
+          {<div className="w-[2.8rem] h-[2.8rem] text-gray-50">{data?.icon}</div> || (
            <Database className="w-8 h-8 text-white" />
          )}
        </div>
@@ -1,5 +1,5 @@
import { TaskItem } from "@/pages/DataManagement/dataset.model";
-import { calculateSHA256, checkIsFilesExist } from "@/utils/file.util";
+import { calculateSHA256, checkIsFilesExist, streamSplitAndUpload, StreamUploadResult } from "@/utils/file.util";
import { App } from "antd";
import { useRef, useState } from "react";
@@ -9,17 +9,18 @@ export function useFileSliceUpload(
  uploadChunk,
  cancelUpload,
}: {
-  preUpload: (id: string, params: any) => Promise<{ data: number }>;
+  preUpload: (id: string, params: Record<string, unknown>) => Promise<{ data: number }>;
-  uploadChunk: (id: string, formData: FormData, config: any) => Promise<any>;
+  uploadChunk: (id: string, formData: FormData, config: Record<string, unknown>) => Promise<unknown>;
-  cancelUpload: ((reqId: number) => Promise<any>) | null;
+  cancelUpload: ((reqId: number) => Promise<unknown>) | null;
},
-  showTaskCenter = true // whether to show the task center while uploading
+  showTaskCenter = true, // whether to show the task center while uploading
+  enableStreamUpload = true // whether to enable streaming split upload
) {
  const { message } = App.useApp();
  const [taskList, setTaskList] = useState<TaskItem[]>([]);
  const taskListRef = useRef<TaskItem[]>([]); // used to keep the task order fixed

-  const createTask = (detail: any = {}) => {
+  const createTask = (detail: Record<string, unknown> = {}) => {
    const { dataset } = detail;
    const title = `上传数据集: ${dataset.name} `;
    const controller = new AbortController();
@@ -37,6 +38,14 @@ export function useFileSliceUpload(
    taskListRef.current = [task, ...taskListRef.current];

    setTaskList(taskListRef.current);

+    // Show the task center right away so the user knows the upload has started
+    if (showTaskCenter) {
+      window.dispatchEvent(
+        new CustomEvent("show:task-popover", { detail: { show: true } })
+      );
+    }

    return task;
  };
@@ -60,7 +69,7 @@ export function useFileSliceUpload(
      // Carry the prefix so the view stays in the current directory after a refresh
      window.dispatchEvent(
        new CustomEvent(task.updateEvent, {
-          detail: { prefix: (task as any).prefix },
+          detail: { prefix: task.prefix },
        })
      );
    }
@@ -71,7 +80,7 @@ export function useFileSliceUpload(
    }
  };

-  async function buildFormData({ file, reqId, i, j }) {
+  async function buildFormData({ file, reqId, i, j }: { file: { slices: Blob[]; name: string; size: number }; reqId: number; i: number; j: number }) {
    const formData = new FormData();
    const { slices, name, size } = file;
    const checkSum = await calculateSHA256(slices[j]);
@@ -86,12 +95,18 @@ export function useFileSliceUpload(
    return formData;
  }

-  async function uploadSlice(task: TaskItem, fileInfo) {
+  async function uploadSlice(task: TaskItem, fileInfo: { loaded: number; i: number; j: number; files: { slices: Blob[]; name: string; size: number }[]; totalSize: number }) {
    if (!task) {
      return;
    }
-    const { reqId, key } = task;
+    const { reqId, key, controller } = task;
    const { loaded, i, j, files, totalSize } = fileInfo;

+    // Check whether the upload has already been cancelled
+    if (controller.signal.aborted) {
+      throw new Error("Upload cancelled");
+    }

    const formData = await buildFormData({
      file: files[i],
      i,
@@ -101,6 +116,7 @@ export function useFileSliceUpload(

    let newTask = { ...task };
    await uploadChunk(key, formData, {
+      signal: controller.signal,
      onUploadProgress: (e) => {
        const loadedSize = loaded + e.loaded;
        const curPercent = Number((loadedSize / totalSize) * 100).toFixed(2);
@@ -116,7 +132,7 @@ export function useFileSliceUpload(
    });
  }

-  async function uploadFile({ task, files, totalSize }) {
+  async function uploadFile({ task, files, totalSize }: { task: TaskItem; files: { slices: Blob[]; name: string; size: number; originFile: Blob }[]; totalSize: number }) {
    console.log('[useSliceUpload] Calling preUpload with prefix:', task.prefix);
    const { data: reqId } = await preUpload(task.key, {
      totalFileNum: files.length,
@@ -132,24 +148,29 @@ export function useFileSliceUpload(
      reqId,
      isCancel: false,
      cancelFn: () => {
-        task.controller.abort();
+        // Use newTask's controller to keep things consistent
+        newTask.controller.abort();
        cancelUpload?.(reqId);
-        if (task.updateEvent) window.dispatchEvent(new Event(task.updateEvent));
+        if (newTask.updateEvent) window.dispatchEvent(new Event(newTask.updateEvent));
      },
    };
    updateTaskList(newTask);
-    if (showTaskCenter) {
-      window.dispatchEvent(
-        new CustomEvent("show:task-popover", { detail: { show: true } })
-      );
-    }
+    // Note: the show:task-popover event is already dispatched in createTask, so it is not dispatched again here
    // // Update data state
    if (task.updateEvent) window.dispatchEvent(new Event(task.updateEvent));

    let loaded = 0;
    for (let i = 0; i < files.length; i++) {
+      // Check whether the upload has been cancelled
+      if (newTask.controller.signal.aborted) {
+        throw new Error("Upload cancelled");
+      }
      const { slices } = files[i];
      for (let j = 0; j < slices.length; j++) {
+        // Check whether the upload has been cancelled
+        if (newTask.controller.signal.aborted) {
+          throw new Error("Upload cancelled");
+        }
        await uploadSlice(newTask, {
          loaded,
          i,
@@ -163,7 +184,7 @@ export function useFileSliceUpload(
    removeTask(newTask);
  }

-  const handleUpload = async ({ task, files }) => {
+  const handleUpload = async ({ task, files }: { task: TaskItem; files: { slices: Blob[]; name: string; size: number; originFile: Blob }[] }) => {
    const isErrorFile = await checkIsFilesExist(files);
    if (isErrorFile) {
      message.error("文件被修改或删除,请重新选择文件上传");
@@ -189,10 +210,174 @@ export function useFileSliceUpload(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 流式分割上传处理
|
||||||
|
* 用于大文件按行分割并立即上传的场景
|
||||||
|
*/
|
||||||
|
const handleStreamUpload = async ({ task, files }: { task: TaskItem; files: File[] }) => {
|
||||||
|
try {
|
||||||
|
console.log('[useSliceUpload] Starting stream upload for', files.length, 'files');
|
||||||
|
|
||||||
|
const totalSize = files.reduce((acc, file) => acc + file.size, 0);
|
||||||
|
|
||||||
|
// 存储所有文件的 reqId,用于取消上传
|
||||||
|
const reqIds: number[] = [];
|
||||||
|
|
||||||
|
const newTask: TaskItem = {
|
||||||
|
...task,
|
||||||
|
reqId: -1,
|
||||||
|
isCancel: false,
|
||||||
|
cancelFn: () => {
|
||||||
|
// 使用 newTask 的 controller 确保一致性
|
||||||
|
newTask.controller.abort();
|
||||||
|
// 取消所有文件的预上传请求
|
||||||
|
reqIds.forEach(id => cancelUpload?.(id));
|
||||||
|
if (newTask.updateEvent) window.dispatchEvent(new Event(newTask.updateEvent));
|
||||||
|
},
|
||||||
|
};
|
||||||
|
updateTaskList(newTask);
|
||||||
|
|
||||||
|
let totalUploadedLines = 0;
|
||||||
|
let totalProcessedBytes = 0;
|
||||||
|
const results: StreamUploadResult[] = [];
|
||||||
|
|
||||||
|
// 逐个处理文件,每个文件单独调用 preUpload
|
||||||
|
for (let i = 0; i < files.length; i++) {
|
||||||
|
// 检查是否已取消
|
||||||
|
if (newTask.controller.signal.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
|
||||||
|
const file = files[i];
|
||||||
|
console.log(`[useSliceUpload] Processing file ${i + 1}/${files.length}: ${file.name}`);
|
||||||
|
|
||||||
|
const result = await streamSplitAndUpload(
|
||||||
|
file,
|
||||||
|
(formData, config) => uploadChunk(task.key, formData, {
|
||||||
|
...config,
|
||||||
|
signal: newTask.controller.signal,
|
||||||
|
}),
|
||||||
|
(currentBytes, totalBytes, uploadedLines) => {
|
||||||
|
// 检查是否已取消
|
||||||
|
if (newTask.controller.signal.aborted) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 更新进度
|
||||||
|
const overallBytes = totalProcessedBytes + currentBytes;
|
||||||
|
const curPercent = Number((overallBytes / totalSize) * 100).toFixed(2);
|
||||||
|
|
||||||
|
const updatedTask: TaskItem = {
|
||||||
|
...newTask,
|
||||||
|
...taskListRef.current.find((item) => item.key === task.key),
|
||||||
|
size: overallBytes,
|
||||||
|
percent: curPercent >= 100 ? 99.99 : curPercent,
|
||||||
|
streamUploadInfo: {
|
||||||
|
currentFile: file.name,
|
||||||
|
fileIndex: i + 1,
|
||||||
|
totalFiles: files.length,
|
||||||
|
uploadedLines: totalUploadedLines + uploadedLines,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
updateTaskList(updatedTask);
|
||||||
|
},
|
||||||
|
1024 * 1024, // 1MB chunk size
|
||||||
|
{
|
||||||
|
resolveReqId: async ({ totalFileNum, totalSize }) => {
|
||||||
|
const { data: reqId } = await preUpload(task.key, {
|
||||||
|
totalFileNum,
|
||||||
|
totalSize,
|
||||||
|
datasetId: task.key,
|
||||||
|
hasArchive: task.hasArchive,
|
||||||
|
prefix: task.prefix,
|
||||||
|
});
|
||||||
|
console.log(`[useSliceUpload] File ${file.name} preUpload response reqId:`, reqId);
|
||||||
|
reqIds.push(reqId);
|
||||||
|
return reqId;
|
||||||
|
},
|
||||||
|
hasArchive: newTask.hasArchive,
|
||||||
|
prefix: newTask.prefix,
|
||||||
|
signal: newTask.controller.signal,
|
||||||
|
maxConcurrency: 3,
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
results.push(result);
|
||||||
|
totalUploadedLines += result.uploadedCount;
|
||||||
|
totalProcessedBytes += file.size;
|
||||||
|
|
||||||
|
console.log(`[useSliceUpload] File ${file.name} processed, uploaded ${result.uploadedCount} lines`);
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('[useSliceUpload] Stream upload completed, total lines:', totalUploadedLines);
|
||||||
|
removeTask(newTask);
|
||||||
|
|
||||||
|
message.success(`成功上传 ${totalUploadedLines} 个文件(按行分割)`);
|
||||||
|
} catch (err) {
|
||||||
|
console.error('[useSliceUpload] Stream upload error:', err);
|
||||||
|
if (err.message === "Upload cancelled") {
|
||||||
|
message.info("上传已取消");
|
||||||
|
} else {
|
||||||
|
message.error("文件上传失败,请稍后重试");
|
||||||
|
}
|
||||||
|
removeTask({
|
||||||
|
...task,
|
||||||
|
isCancel: true,
|
||||||
|
...taskListRef.current.find((item) => item.key === task.key),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 注册流式上传事件监听
|
||||||
|
* 返回注销函数
|
||||||
|
*/
|
||||||
|
const registerStreamUploadListener = () => {
|
||||||
|
if (!enableStreamUpload) return () => {};
|
||||||
|
|
||||||
|
const streamUploadHandler = async (e: Event) => {
|
||||||
|
const customEvent = e as CustomEvent;
|
||||||
|
const { dataset, files, updateEvent, hasArchive, prefix } = customEvent.detail;
|
||||||
|
|
||||||
|
const controller = new AbortController();
|
||||||
|
const task: TaskItem = {
|
||||||
|
key: dataset.id,
|
||||||
|
title: `上传数据集: ${dataset.name} (按行分割)`,
|
||||||
|
percent: 0,
|
||||||
|
reqId: -1,
|
||||||
|
controller,
|
||||||
|
size: 0,
|
||||||
|
updateEvent,
|
||||||
|
hasArchive,
|
||||||
|
prefix,
|
||||||
|
};
|
||||||
|
|
||||||
|
taskListRef.current = [task, ...taskListRef.current];
|
||||||
|
setTaskList(taskListRef.current);
|
||||||
|
|
||||||
|
// 显示任务中心
|
||||||
|
if (showTaskCenter) {
|
||||||
|
window.dispatchEvent(
|
||||||
|
new CustomEvent("show:task-popover", { detail: { show: true } })
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
await handleStreamUpload({ task, files });
|
||||||
|
};
|
||||||
|
|
||||||
|
window.addEventListener("upload:dataset-stream", streamUploadHandler);
|
||||||
|
|
||||||
|
return () => {
|
||||||
|
window.removeEventListener("upload:dataset-stream", streamUploadHandler);
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
return {
|
return {
|
||||||
taskList,
|
taskList,
|
||||||
createTask,
|
createTask,
|
||||||
removeTask,
|
removeTask,
|
||||||
handleUpload,
|
handleUpload,
|
||||||
|
handleStreamUpload,
|
||||||
|
registerStreamUploadListener,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
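For orientation, a minimal caller sketch (not part of the commits above): the event name and detail fields mirror the "upload:dataset-stream" listener added in this hook, while the surrounding function, the dataset shape, and the updateEvent value are assumptions.

// Hypothetical trigger for the stream-upload path registered above.
// Assumes a dataset object with id/name and a File[] collected from an <input type="file">.
function triggerStreamUpload(dataset: { id: string; name: string }, files: File[]) {
  window.dispatchEvent(
    new CustomEvent("upload:dataset-stream", {
      detail: {
        dataset,
        files,
        updateEvent: "dataset:refresh", // the hook re-dispatches this event name so the page can refresh
        hasArchive: false,
        prefix: "",
      },
    })
  );
}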
@@ -3,7 +3,9 @@
 * 通过 iframe 加载外部页面
 */
 export default function ContentGenerationPage() {
-const iframeUrl = "http://192.168.0.8:3000";
+const iframeUrl = "/api#/meeting";

+window.localStorage.setItem("geeker-user", '{"token":"123","userInfo":{"name":"xteam"},"loginFrom":null,"loginData":null}');
+
 return (
 <div className="h-full w-full flex flex-col">
@@ -16,6 +18,11 @@ export default function ContentGenerationPage() {
 className="w-full h-full border-0"
 title="内容生成"
 sandbox="allow-same-origin allow-scripts allow-popups allow-forms allow-downloads"
+style={{marginLeft: "-220px",
+marginTop: "-66px",
+width: "calc(100% + 233px)",
+height: "calc(100% + 108px)"
+}}
 />
 </div>
 </div>
@@ -1,6 +1,6 @@
 import { useCallback, useEffect, useMemo, useRef, useState } from "react";
-import { App, Button, Card, List, Spin, Typography, Tag, Switch, Tree, Empty } from "antd";
+import { App, Button, Card, List, Spin, Typography, Tag, Empty } from "antd";
-import { LeftOutlined, ReloadOutlined, SaveOutlined, MenuFoldOutlined, MenuUnfoldOutlined, CheckOutlined } from "@ant-design/icons";
+import { LeftOutlined, ReloadOutlined, SaveOutlined, MenuFoldOutlined, MenuUnfoldOutlined } from "@ant-design/icons";
 import { useNavigate, useParams } from "react-router";

 import {
@@ -28,7 +28,6 @@ type EditorTaskListItem = {
 hasAnnotation: boolean;
 annotationUpdatedAt?: string | null;
 annotationStatus?: AnnotationResultStatus | null;
-segmentStats?: SegmentStats;
 };

 type LsfMessage = {
@@ -36,21 +35,6 @@ type LsfMessage = {
 payload?: unknown;
 };

-type SegmentInfo = {
-idx: number;
-text: string;
-start: number;
-end: number;
-hasAnnotation: boolean;
-lineIndex: number;
-chunkIndex: number;
-};
-
-type SegmentStats = {
-done: number;
-total: number;
-};
-
 type ApiResponse<T> = {
 code?: number;
 message?: string;
@@ -66,10 +50,11 @@ type EditorTaskPayload = {
 type EditorTaskResponse = {
 task?: EditorTaskPayload;
 segmented?: boolean;
-segments?: SegmentInfo[];
+totalSegments?: number;
 currentSegmentIndex?: number;
 };

+
 type EditorTaskListResponse = {
 content?: EditorTaskListItem[];
 totalElements?: number;
@@ -91,8 +76,6 @@ type ExportPayload = {
 requestId?: string | null;
 };

-type SwitchDecision = "save" | "discard" | "cancel";
-
 const LSF_IFRAME_SRC = "/lsf/lsf.html";
 const TASK_PAGE_START = 0;
 const TASK_PAGE_SIZE = 200;
@@ -154,16 +137,6 @@ const isAnnotationResultEmpty = (annotation?: Record<string, unknown>) => {
 };

 const resolveTaskStatusMeta = (item: EditorTaskListItem) => {
-const segmentSummary = resolveSegmentSummary(item);
-if (segmentSummary) {
-if (segmentSummary.done >= segmentSummary.total) {
-return { text: "已标注", type: "success" as const };
-}
-if (segmentSummary.done > 0) {
-return { text: "标注中", type: "warning" as const };
-}
-return { text: "未标注", type: "secondary" as const };
-}
 if (!item.hasAnnotation) {
 return { text: "未标注", type: "secondary" as const };
 }
@@ -216,25 +189,6 @@ const buildAnnotationSnapshot = (annotation?: Record<string, unknown>) => {
 const buildSnapshotKey = (fileId: string, segmentIndex?: number) =>
 `${fileId}::${segmentIndex ?? "full"}`;

-const buildSegmentStats = (segmentList?: SegmentInfo[] | null): SegmentStats | null => {
-if (!Array.isArray(segmentList) || segmentList.length === 0) return null;
-const total = segmentList.length;
-const done = segmentList.reduce((count, seg) => count + (seg.hasAnnotation ? 1 : 0), 0);
-return { done, total };
-};
-
-const normalizeSegmentStats = (stats?: SegmentStats | null): SegmentStats | null => {
-if (!stats) return null;
-const total = Number(stats.total);
-const done = Number(stats.done);
-if (!Number.isFinite(total) || total <= 0) return null;
-const safeDone = Math.min(Math.max(done, 0), total);
-return { done: safeDone, total };
-};
-
-const resolveSegmentSummary = (item: EditorTaskListItem) =>
-normalizeSegmentStats(item.segmentStats);
-
 const mergeTaskItems = (base: EditorTaskListItem[], next: EditorTaskListItem[]) => {
 if (next.length === 0) return base;
 const seen = new Set(base.map((item) => item.fileId));
@@ -282,18 +236,13 @@ export default function LabelStudioTextEditor() {
 resolve: (payload?: ExportPayload) => void;
 timer?: number;
 } | null>(null);
-const exportCheckSeqRef = useRef(0);
 const savedSnapshotsRef = useRef<Record<string, string>>({});
 const pendingAutoAdvanceRef = useRef(false);
-const segmentStatsCacheRef = useRef<Record<string, SegmentStats>>({});
-const segmentStatsSeqRef = useRef(0);
-const segmentStatsLoadingRef = useRef<Set<string>>(new Set());

 const [loadingProject, setLoadingProject] = useState(true);
 const [loadingTasks, setLoadingTasks] = useState(false);
 const [loadingTaskDetail, setLoadingTaskDetail] = useState(false);
 const [saving, setSaving] = useState(false);
-const [segmentSwitching, setSegmentSwitching] = useState(false);

 const [iframeReady, setIframeReady] = useState(false);
 const [lsReady, setLsReady] = useState(false);
@@ -306,16 +255,19 @@ export default function LabelStudioTextEditor() {
 const [prefetching, setPrefetching] = useState(false);
 const [selectedFileId, setSelectedFileId] = useState<string>("");
 const [sidebarCollapsed, setSidebarCollapsed] = useState(false);
-const [autoSaveOnSwitch, setAutoSaveOnSwitch] = useState(false);

 // 分段相关状态
 const [segmented, setSegmented] = useState(false);
-const [segments, setSegments] = useState<SegmentInfo[]>([]);
 const [currentSegmentIndex, setCurrentSegmentIndex] = useState(0);
+const [segmentTotal, setSegmentTotal] = useState(0);
 const isTextProject = useMemo(
 () => (project?.datasetType || "").toUpperCase() === "TEXT",
 [project?.datasetType],
 );
+const segmentIndices = useMemo(() => {
+if (segmentTotal <= 0) return [] as number[];
+return Array.from({ length: segmentTotal }, (_, index) => index);
+}, [segmentTotal]);

 const focusIframe = useCallback(() => {
 const iframe = iframeRef.current;
@@ -330,70 +282,6 @@ export default function LabelStudioTextEditor() {
 win.postMessage({ type, payload }, origin);
 }, [origin]);

-const applySegmentStats = useCallback((fileId: string, stats: SegmentStats | null) => {
-if (!fileId) return;
-const normalized = normalizeSegmentStats(stats);
-setTasks((prev) =>
-prev.map((item) =>
-item.fileId === fileId
-? { ...item, segmentStats: normalized || undefined }
-: item
-)
-);
-}, []);
-
-const updateSegmentStatsCache = useCallback((fileId: string, stats: SegmentStats | null) => {
-if (!fileId) return;
-const normalized = normalizeSegmentStats(stats);
-if (normalized) {
-segmentStatsCacheRef.current[fileId] = normalized;
-} else {
-delete segmentStatsCacheRef.current[fileId];
-}
-applySegmentStats(fileId, normalized);
-}, [applySegmentStats]);
-
-const fetchSegmentStatsForFile = useCallback(async (fileId: string, seq: number) => {
-if (!projectId || !fileId) return;
-if (segmentStatsCacheRef.current[fileId] || segmentStatsLoadingRef.current.has(fileId)) return;
-segmentStatsLoadingRef.current.add(fileId);
-try {
-const resp = (await getEditorTaskUsingGet(projectId, fileId, {
-segmentIndex: 0,
-})) as ApiResponse<EditorTaskResponse>;
-if (segmentStatsSeqRef.current !== seq) return;
-const data = resp?.data;
-if (!data?.segmented) return;
-const stats = buildSegmentStats(data.segments);
-if (!stats) return;
-segmentStatsCacheRef.current[fileId] = stats;
-applySegmentStats(fileId, stats);
-} catch (e) {
-console.error(e);
-} finally {
-segmentStatsLoadingRef.current.delete(fileId);
-}
-}, [applySegmentStats, projectId]);
-
-const prefetchSegmentStats = useCallback((items: EditorTaskListItem[]) => {
-if (!projectId) return;
-const fileIds = items
-.map((item) => item.fileId)
-.filter((fileId) => fileId && !segmentStatsCacheRef.current[fileId]);
-if (fileIds.length === 0) return;
-const seq = segmentStatsSeqRef.current;
-let cursor = 0;
-const workerCount = Math.min(3, fileIds.length);
-const runWorker = async () => {
-while (cursor < fileIds.length && segmentStatsSeqRef.current === seq) {
-const fileId = fileIds[cursor];
-cursor += 1;
-await fetchSegmentStatsForFile(fileId, seq);
-}
-};
-void Promise.all(Array.from({ length: workerCount }, () => runWorker()));
-}, [fetchSegmentStatsForFile, projectId]);
-
 const confirmEmptyAnnotationStatus = useCallback(() => {
 return new Promise<AnnotationResultStatus | null>((resolve) => {
 let resolved = false;
@@ -446,8 +334,6 @@ export default function LabelStudioTextEditor() {

 const updateTaskSelection = useCallback((items: EditorTaskListItem[]) => {
 const isCompleted = (item: EditorTaskListItem) => {
-const summary = resolveSegmentSummary(item);
-if (summary) return summary.done >= summary.total;
 return item.hasAnnotation;
 };
 const defaultFileId =
@@ -508,9 +394,6 @@ export default function LabelStudioTextEditor() {
 if (mode === "reset") {
 prefetchSeqRef.current += 1;
 setPrefetching(false);
-segmentStatsSeqRef.current += 1;
-segmentStatsCacheRef.current = {};
-segmentStatsLoadingRef.current = new Set();
 }
 if (mode === "append") {
 setLoadingMore(true);
@@ -591,20 +474,19 @@ export default function LabelStudioTextEditor() {
 if (seq !== initSeqRef.current) return;

 // 更新分段状态
-const segmentIndex = data?.segmented
+const isSegmented = !!data?.segmented;
+const segmentIndex = isSegmented
 ? resolveSegmentIndex(data.currentSegmentIndex) ?? 0
 : undefined;
-if (data?.segmented) {
+if (isSegmented) {
-const stats = buildSegmentStats(data.segments);
 setSegmented(true);
-setSegments(data.segments || []);
 setCurrentSegmentIndex(segmentIndex ?? 0);
-updateSegmentStatsCache(fileId, stats);
+const totalSegments = Number(data?.totalSegments ?? 0);
+setSegmentTotal(Number.isFinite(totalSegments) && totalSegments > 0 ? totalSegments : 0);
 } else {
 setSegmented(false);
-setSegments([]);
 setCurrentSegmentIndex(0);
-updateSegmentStatsCache(fileId, null);
+setSegmentTotal(0);
 }

 const taskData = {
@@ -664,19 +546,14 @@ export default function LabelStudioTextEditor() {
 } finally {
 if (seq === initSeqRef.current) setLoadingTaskDetail(false);
 }
-}, [iframeReady, message, postToIframe, project, projectId, updateSegmentStatsCache]);
+}, [iframeReady, message, postToIframe, project, projectId]);

 const advanceAfterSave = useCallback(async (fileId: string, segmentIndex?: number) => {
 if (!fileId) return;
-if (segmented && segments.length > 0) {
+if (segmented && segmentTotal > 0) {
-const sortedSegmentIndices = segments
+const baseIndex = Math.max(segmentIndex ?? currentSegmentIndex, 0);
-.map((seg) => seg.idx)
+const nextSegmentIndex = baseIndex + 1;
-.sort((a, b) => a - b);
+if (nextSegmentIndex < segmentTotal) {
-const baseIndex = segmentIndex ?? currentSegmentIndex;
-const currentPos = sortedSegmentIndices.indexOf(baseIndex);
-const nextSegmentIndex =
-currentPos >= 0 ? sortedSegmentIndices[currentPos + 1] : sortedSegmentIndices[0];
-if (nextSegmentIndex !== undefined) {
 await initEditorForFile(fileId, nextSegmentIndex);
 return;
 }
@@ -698,7 +575,7 @@ export default function LabelStudioTextEditor() {
 initEditorForFile,
 message,
 segmented,
-segments,
+segmentTotal,
 tasks,
 ]);

@@ -772,16 +649,6 @@ export default function LabelStudioTextEditor() {
 const snapshot = buildAnnotationSnapshot(isRecord(annotation) ? annotation : undefined);
 savedSnapshotsRef.current[snapshotKey] = snapshot;

-// 分段模式下更新当前段落的标注状态
-if (segmented && segmentIndex !== undefined) {
-const nextSegments = segments.map((seg) =>
-seg.idx === segmentIndex
-? { ...seg, hasAnnotation: true }
-: seg
-);
-setSegments(nextSegments);
-updateSegmentStatsCache(String(fileId), buildSegmentStats(nextSegments));
-}
 if (options?.autoAdvance) {
 await advanceAfterSave(String(fileId), segmentIndex);
 }
@@ -800,69 +667,10 @@ export default function LabelStudioTextEditor() {
 message,
 projectId,
 segmented,
-segments,
 selectedFileId,
 tasks,
-updateSegmentStatsCache,
 ]);

-const requestExportForCheck = useCallback(() => {
-if (!iframeReady || !lsReady) return Promise.resolve(undefined);
-if (exportCheckRef.current) {
-if (exportCheckRef.current.timer) {
-window.clearTimeout(exportCheckRef.current.timer);
-}
-exportCheckRef.current.resolve(undefined);
-exportCheckRef.current = null;
-}
-const requestId = `check_${Date.now()}_${++exportCheckSeqRef.current}`;
-return new Promise<ExportPayload | undefined>((resolve) => {
-const timer = window.setTimeout(() => {
-if (exportCheckRef.current?.requestId === requestId) {
-exportCheckRef.current = null;
-}
-resolve(undefined);
-}, 3000);
-exportCheckRef.current = {
-requestId,
-resolve,
-timer,
-};
-postToIframe("LS_EXPORT_CHECK", { requestId });
-});
-}, [iframeReady, lsReady, postToIframe]);
-
-const confirmSaveBeforeSwitch = useCallback(() => {
-return new Promise<SwitchDecision>((resolve) => {
-let resolved = false;
-let modalInstance: { destroy: () => void } | null = null;
-const settle = (decision: SwitchDecision) => {
-if (resolved) return;
-resolved = true;
-resolve(decision);
-};
-const handleDiscard = () => {
-if (modalInstance) modalInstance.destroy();
-settle("discard");
-};
-modalInstance = modal.confirm({
-title: "当前段落有未保存标注",
-content: (
-<div className="flex flex-col gap-2">
-<Typography.Text>切换段落前请先保存当前标注。</Typography.Text>
-<Button type="link" danger style={{ padding: 0, height: "auto" }} onClick={handleDiscard}>
-放弃未保存并切换
-</Button>
-</div>
-),
-okText: "保存并切换",
-cancelText: "取消",
-onOk: () => settle("save"),
-onCancel: () => settle("cancel"),
-});
-});
-}, [modal]);
-
 const requestExport = useCallback((autoAdvance: boolean) => {
 if (!selectedFileId) {
 message.warning("请先选择文件");
@@ -875,7 +683,7 @@ export default function LabelStudioTextEditor() {
 useEffect(() => {
 const handleSaveShortcut = (event: KeyboardEvent) => {
 if (!isSaveShortcut(event) || event.repeat) return;
-if (saving || loadingTaskDetail || segmentSwitching) return;
+if (saving || loadingTaskDetail) return;
 if (!iframeReady || !lsReady) return;
 event.preventDefault();
 event.stopPropagation();
@@ -883,83 +691,7 @@ export default function LabelStudioTextEditor() {
 };
 window.addEventListener("keydown", handleSaveShortcut);
 return () => window.removeEventListener("keydown", handleSaveShortcut);
-}, [iframeReady, loadingTaskDetail, lsReady, requestExport, saving, segmentSwitching]);
+}, [iframeReady, loadingTaskDetail, lsReady, requestExport, saving]);

-// 段落切换处理
-const handleSegmentChange = useCallback(async (newIndex: number) => {
-if (newIndex === currentSegmentIndex) return;
-if (segmentSwitching || saving || loadingTaskDetail) return;
-if (!iframeReady || !lsReady) {
-message.warning("编辑器未就绪,无法切换段落");
-return;
-}
-
-setSegmentSwitching(true);
-try {
-const payload = await requestExportForCheck();
-if (!payload) {
-message.warning("无法读取当前标注,已取消切换");
-return;
-}
-
-const payloadTaskId = payload.taskId;
-if (expectedTaskIdRef.current && payloadTaskId) {
-if (Number(payloadTaskId) !== expectedTaskIdRef.current) {
-message.warning("已忽略过期的标注数据");
-return;
-}
-}
-
-const payloadFileId = payload.fileId || selectedFileId;
-const payloadSegmentIndex = resolveSegmentIndex(payload.segmentIndex);
-const resolvedSegmentIndex =
-payloadSegmentIndex !== undefined
-? payloadSegmentIndex
-: segmented
-? currentSegmentIndex
-: undefined;
-const annotation = isRecord(payload.annotation) ? payload.annotation : undefined;
-const snapshotKey = payloadFileId
-? buildSnapshotKey(String(payloadFileId), resolvedSegmentIndex)
-: undefined;
-const latestSnapshot = buildAnnotationSnapshot(annotation);
-const lastSnapshot = snapshotKey ? savedSnapshotsRef.current[snapshotKey] : undefined;
-const hasUnsavedChange = snapshotKey !== undefined && lastSnapshot !== undefined && latestSnapshot !== lastSnapshot;
-
-if (hasUnsavedChange) {
-if (autoSaveOnSwitch) {
-const saved = await saveFromExport(payload);
-if (!saved) return;
-} else {
-const decision = await confirmSaveBeforeSwitch();
-if (decision === "cancel") return;
-if (decision === "save") {
-const saved = await saveFromExport(payload);
-if (!saved) return;
-}
-}
-}
-
-await initEditorForFile(selectedFileId, newIndex);
-} finally {
-setSegmentSwitching(false);
-}
-}, [
-autoSaveOnSwitch,
-confirmSaveBeforeSwitch,
-currentSegmentIndex,
-iframeReady,
-initEditorForFile,
-loadingTaskDetail,
-lsReady,
-message,
-requestExportForCheck,
-saveFromExport,
-segmented,
-selectedFileId,
-segmentSwitching,
-saving,
-]);
-
 useEffect(() => {
 setIframeReady(false);
@@ -977,12 +709,9 @@ export default function LabelStudioTextEditor() {
 expectedTaskIdRef.current = null;
 // 重置分段状态
 setSegmented(false);
-setSegments([]);
 setCurrentSegmentIndex(0);
+setSegmentTotal(0);
 savedSnapshotsRef.current = {};
-segmentStatsSeqRef.current += 1;
-segmentStatsCacheRef.current = {};
-segmentStatsLoadingRef.current = new Set();
 if (exportCheckRef.current?.timer) {
 window.clearTimeout(exportCheckRef.current.timer);
 }
@@ -996,12 +725,6 @@ export default function LabelStudioTextEditor() {
 loadTasks({ mode: "reset" });
 }, [project?.supported, loadTasks]);

-useEffect(() => {
-if (!segmented) return;
-if (tasks.length === 0) return;
-prefetchSegmentStats(tasks);
-}, [prefetchSegmentStats, segmented, tasks]);
-
 useEffect(() => {
 if (!selectedFileId) return;
 initEditorForFile(selectedFileId);
@@ -1026,60 +749,6 @@ export default function LabelStudioTextEditor() {
 return () => window.removeEventListener("focus", handleWindowFocus);
 }, [focusIframe, lsReady]);

-const segmentTreeData = useMemo(() => {
-if (!segmented || segments.length === 0) return [];
-const lineMap = new Map<number, SegmentInfo[]>();
-segments.forEach((seg) => {
-const list = lineMap.get(seg.lineIndex) || [];
-list.push(seg);
-lineMap.set(seg.lineIndex, list);
-});
-return Array.from(lineMap.entries())
-.sort((a, b) => a[0] - b[0])
-.map(([lineIndex, lineSegments]) => ({
-key: `line-${lineIndex}`,
-title: `第${lineIndex + 1}行`,
-selectable: false,
-children: lineSegments
-.sort((a, b) => a.chunkIndex - b.chunkIndex)
-.map((seg) => ({
-key: `seg-${seg.idx}`,
-title: (
-<span className="flex items-center gap-1">
-<span>{`片${seg.chunkIndex + 1}`}</span>
-{seg.hasAnnotation && (
-<CheckOutlined style={{ fontSize: 10, color: "#52c41a" }} />
-)}
-</span>
-),
-})),
-}));
-}, [segmented, segments]);
-
-const segmentLineKeys = useMemo(
-() => segmentTreeData.map((item) => String(item.key)),
-[segmentTreeData]
-);
-
-const inProgressSegmentedCount = useMemo(() => {
-if (tasks.length === 0) return 0;
-return tasks.reduce((count, item) => {
-const summary = resolveSegmentSummary(item);
-if (!summary) return count;
-return summary.done < summary.total ? count + 1 : count;
-}, 0);
-}, [tasks]);
-
-const handleSegmentSelect = useCallback((keys: Array<string | number>) => {
-const [first] = keys;
-if (first === undefined || first === null) return;
-const key = String(first);
-if (!key.startsWith("seg-")) return;
-const nextIndex = Number(key.replace("seg-", ""));
-if (!Number.isFinite(nextIndex)) return;
-handleSegmentChange(nextIndex);
-}, [handleSegmentChange]);
-
 useEffect(() => {
 const handler = (event: MessageEvent<LsfMessage>) => {
 if (event.origin !== origin) return;
@@ -1148,7 +817,7 @@ export default function LabelStudioTextEditor() {

 const canLoadMore = taskTotalPages > 0 && taskPage + 1 < taskTotalPages;
 const saveDisabled =
-!iframeReady || !selectedFileId || saving || segmentSwitching || loadingTaskDetail;
+!iframeReady || !selectedFileId || saving || loadingTaskDetail;
 const loadMoreNode = canLoadMore ? (
 <div className="p-2 text-center">
 <Button
@@ -1265,11 +934,6 @@ export default function LabelStudioTextEditor() {
 >
 <div className="px-3 py-2 border-b border-gray-200 bg-white font-medium text-sm flex items-center justify-between gap-2">
 <span>文件列表</span>
-{segmented && (
-<Tag color="orange" style={{ margin: 0 }}>
-标注中 {inProgressSegmentedCount}
-</Tag>
-)}
 </div>
 <div className="flex-1 min-h-0 overflow-auto">
 <List
@@ -1278,7 +942,6 @@ export default function LabelStudioTextEditor() {
 dataSource={tasks}
 loadMore={loadMoreNode}
 renderItem={(item) => {
-const segmentSummary = resolveSegmentSummary(item);
 const statusMeta = resolveTaskStatusMeta(item);
 return (
 <List.Item
@@ -1300,11 +963,6 @@ export default function LabelStudioTextEditor() {
 <Typography.Text type={statusMeta.type} style={{ fontSize: 11 }}>
 {statusMeta.text}
 </Typography.Text>
-{segmentSummary && (
-<Typography.Text type="secondary" style={{ fontSize: 10 }}>
-已标注 {segmentSummary.done}/{segmentSummary.total}
-</Typography.Text>
-)}
 </div>
 {item.annotationUpdatedAt && (
 <Typography.Text type="secondary" style={{ fontSize: 10 }}>
@@ -1323,21 +981,28 @@ export default function LabelStudioTextEditor() {
 <div className="px-3 py-2 border-b border-gray-200 bg-gray-50 font-medium text-sm flex items-center justify-between">
 <span>段落/分段</span>
 <Tag color="blue" style={{ margin: 0 }}>
-{currentSegmentIndex + 1} / {segments.length}
+{segmentTotal > 0 ? currentSegmentIndex + 1 : 0} / {segmentTotal}
 </Tag>
 </div>
 <div className="flex-1 min-h-0 overflow-auto px-2 py-2">
-{segments.length > 0 ? (
+{segmentTotal > 0 ? (
-<Tree
+<div className="grid grid-cols-[repeat(auto-fill,minmax(44px,1fr))] gap-1">
-showLine
+{segmentIndices.map((segmentIndex) => {
-blockNode
+const isCurrent = segmentIndex === currentSegmentIndex;
-selectedKeys={
+return (
-segmented ? [`seg-${currentSegmentIndex}`] : []
+<div
+key={segmentIndex}
+className={
+isCurrent
+? "h-7 leading-7 rounded bg-blue-500 text-white text-center text-xs font-medium"
+: "h-7 leading-7 rounded bg-gray-100 text-gray-700 text-center text-xs"
 }
-expandedKeys={segmentLineKeys}
+>
-onSelect={handleSegmentSelect}
+{segmentIndex + 1}
-treeData={segmentTreeData}
+</div>
-/>
+);
+})}
+</div>
 ) : (
 <div className="py-6">
 <Empty
@@ -1347,17 +1012,6 @@ export default function LabelStudioTextEditor() {
 </div>
 )}
 </div>
-<div className="px-3 py-2 border-t border-gray-200 flex items-center justify-between">
-<Typography.Text style={{ fontSize: 12 }}>
-切段自动保存
-</Typography.Text>
-<Switch
-size="small"
-checked={autoSaveOnSwitch}
-onChange={(checked) => setAutoSaveOnSwitch(checked)}
-disabled={segmentSwitching || saving || loadingTaskDetail || !lsReady}
-/>
-</div>
 </div>
 )}
 </div>
@@ -57,6 +57,9 @@ export default function DataAnnotation() {
 const [selectedRowKeys, setSelectedRowKeys] = useState<AnnotationTaskRowKey[]>([]);
 const [selectedRows, setSelectedRows] = useState<AnnotationTaskListItem[]>([]);

+const toSafeCount = (value: unknown) =>
+typeof value === "number" && Number.isFinite(value) ? value : 0;
+
 const handleAnnotate = (task: AnnotationTaskListItem) => {
 const projectId = task.id;
 if (!projectId) {
@@ -207,8 +210,20 @@ export default function DataAnnotation() {
 width: 100,
 align: "center" as const,
 render: (value: number, record: AnnotationTaskListItem) => {
-const total = record.totalCount || 0;
+const total = toSafeCount(record.totalCount ?? record.total_count);
-const annotated = value || 0;
+const annotatedRaw = toSafeCount(
+value ?? record.annotatedCount ?? record.annotated_count
+);
+const segmentationEnabled =
+record.segmentationEnabled ?? record.segmentation_enabled;
+const inProgressRaw = segmentationEnabled
+? toSafeCount(record.inProgressCount ?? record.in_progress_count)
+: 0;
+const shouldExcludeInProgress =
+total > 0 && annotatedRaw + inProgressRaw > total;
+const annotated = shouldExcludeInProgress
+? Math.max(annotatedRaw - inProgressRaw, 0)
+: annotatedRaw;
 const percent = total > 0 ? Math.round((annotated / total) * 100) : 0;
 return (
 <span title={`${annotated}/${total} (${percent}%)`}>
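A small illustration of the adjusted progress math above (names and values are local to this sketch): when annotated plus in-progress overflows the total, the in-progress part is subtracted before the percentage is computed.

// Mirrors the render logic above with standalone, hypothetical inputs.
function annotationPercent(total: number, annotatedRaw: number, inProgressRaw: number): number {
  const shouldExcludeInProgress = total > 0 && annotatedRaw + inProgressRaw > total;
  const annotated = shouldExcludeInProgress ? Math.max(annotatedRaw - inProgressRaw, 0) : annotatedRaw;
  return total > 0 ? Math.round((annotated / total) * 100) : 0;
}

console.log(annotationPercent(10, 8, 3)); // 50 — 8 + 3 > 10, so the 3 in-progress items are excluded
console.log(annotationPercent(10, 6, 3)); // 60 — no overflow, annotated count used as-is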
@@ -43,14 +43,6 @@ const TemplateDetail: React.FC<TemplateDetailProps> = ({
|
|||||||
<Descriptions.Item label="样式">
|
<Descriptions.Item label="样式">
|
||||||
{template.style}
|
{template.style}
|
||||||
</Descriptions.Item>
|
</Descriptions.Item>
|
||||||
<Descriptions.Item label="类型">
|
|
||||||
<Tag color={template.builtIn ? "gold" : "default"}>
|
|
||||||
{template.builtIn ? "系统内置" : "自定义"}
|
|
||||||
</Tag>
|
|
||||||
</Descriptions.Item>
|
|
||||||
<Descriptions.Item label="版本">
|
|
||||||
{template.version}
|
|
||||||
</Descriptions.Item>
|
|
||||||
<Descriptions.Item label="创建时间" span={2}>
|
<Descriptions.Item label="创建时间" span={2}>
|
||||||
{new Date(template.createdAt).toLocaleString()}
|
{new Date(template.createdAt).toLocaleString()}
|
||||||
</Descriptions.Item>
|
</Descriptions.Item>
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ const TemplateForm: React.FC<TemplateFormProps> = ({
 const [form] = Form.useForm();
 const [loading, setLoading] = useState(false);
 const [labelConfig, setLabelConfig] = useState("");
+const selectedDataType = Form.useWatch("dataType", form);

 useEffect(() => {
 if (visible && template && mode === "edit") {
@@ -96,8 +97,12 @@ const TemplateForm: React.FC<TemplateFormProps> = ({
 } else {
 message.error(response.message || `模板${mode === "create" ? "创建" : "更新"}失败`);
 }
-} catch (error: any) {
+} catch (error: unknown) {
-if (error.errorFields) {
+const hasErrorFields =
+typeof error === "object" &&
+error !== null &&
+"errorFields" in error;
+if (hasErrorFields) {
 message.error("请填写所有必填字段");
 } else {
 message.error(`模板${mode === "create" ? "创建" : "更新"}失败`);
@@ -195,6 +200,7 @@ const TemplateForm: React.FC<TemplateFormProps> = ({
 value={labelConfig}
 onChange={setLabelConfig}
 height={420}
+dataType={selectedDataType}
 />
 </div>
 </Form>
@@ -1,4 +1,4 @@
-import React, { useState } from "react";
+import React, { useState, useEffect } from "react";
 import {
 Button,
 Table,
@@ -32,7 +32,16 @@ import {
 TemplateTypeMap
 } from "@/pages/DataAnnotation/annotation.const.tsx";

+const TEMPLATE_ADMIN_KEY = "datamate_template_admin";
+
 const TemplateList: React.FC = () => {
+const [isAdmin, setIsAdmin] = useState(false);
+
+useEffect(() => {
+// 检查 localStorage 中是否存在特殊键
+const hasAdminKey = localStorage.getItem(TEMPLATE_ADMIN_KEY) !== null;
+setIsAdmin(hasAdminKey);
+}, []);
 const filterOptions = [
 {
 key: "category",
@@ -225,23 +234,7 @@ const TemplateList: React.FC = () => {
 <Tag color={getCategoryColor(category)}>{ClassificationMap[category as keyof typeof ClassificationMap]?.label || category}</Tag>
 ),
 },
-{
-title: "类型",
-dataIndex: "builtIn",
-key: "builtIn",
-width: 100,
-render: (builtIn: boolean) => (
-<Tag color={builtIn ? "gold" : "default"}>
-{builtIn ? "系统内置" : "自定义"}
-</Tag>
-),
-},
-{
-title: "版本",
-dataIndex: "version",
-key: "version",
-width: 80,
-},
 {
 title: "创建时间",
 dataIndex: "createdAt",
@@ -263,6 +256,7 @@ const TemplateList: React.FC = () => {
 onClick={() => handleView(record)}
 />
 </Tooltip>
+{isAdmin && (
 <>
 <Tooltip title="编辑">
 <Button
@@ -286,6 +280,7 @@ const TemplateList: React.FC = () => {
 </Tooltip>
 </Popconfirm>
 </>
+)}
 </Space>
 ),
 },
@@ -310,11 +305,13 @@ const TemplateList: React.FC = () => {
 </div>

 {/* Right side: Create button */}
+{isAdmin && (
 <div className="flex items-center gap-2">
 <Button type="primary" icon={<PlusOutlined />} onClick={handleCreate}>
 创建模板
 </Button>
 </div>
+)}
 </div>

 <Card>
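A sketch of how the hidden admin gate introduced above could be toggled from the browser console; the storage key comes from the diff, while the helper itself and its reload behaviour are assumptions.

// TemplateList only checks that the key exists, so any value enables the edit/delete/create actions.
const TEMPLATE_ADMIN_KEY = "datamate_template_admin"; // same key as in the diff

function setTemplateAdmin(enabled: boolean): void {
  if (enabled) {
    window.localStorage.setItem(TEMPLATE_ADMIN_KEY, "1");
  } else {
    window.localStorage.removeItem(TEMPLATE_ADMIN_KEY);
  }
  // The flag is read once on mount, so reload the page to apply it.
  window.location.reload();
}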
@@ -3,16 +3,19 @@ import { get, post, put, del, download } from "@/utils/request";
 // 导出格式类型
 export type ExportFormat = "json" | "jsonl" | "csv" | "coco" | "yolo";

+type RequestParams = Record<string, unknown>;
+type RequestPayload = Record<string, unknown>;
+
 // 标注任务管理相关接口
-export function queryAnnotationTasksUsingGet(params?: any) {
+export function queryAnnotationTasksUsingGet(params?: RequestParams) {
 return get("/api/annotation/project", params);
 }

-export function createAnnotationTaskUsingPost(data: any) {
+export function createAnnotationTaskUsingPost(data: RequestPayload) {
 return post("/api/annotation/project", data);
 }

-export function syncAnnotationTaskUsingPost(data: any) {
+export function syncAnnotationTaskUsingPost(data: RequestPayload) {
 return post(`/api/annotation/task/sync`, data);
 }

@@ -25,7 +28,7 @@ export function getAnnotationTaskByIdUsingGet(taskId: string) {
 return get(`/api/annotation/project/${taskId}`);
 }

-export function updateAnnotationTaskByIdUsingPut(taskId: string, data: any) {
+export function updateAnnotationTaskByIdUsingPut(taskId: string, data: RequestPayload) {
 return put(`/api/annotation/project/${taskId}`, data);
 }

@@ -35,17 +38,17 @@ export function getTagConfigUsingGet() {
 }

 // 标注模板管理
-export function queryAnnotationTemplatesUsingGet(params?: any) {
+export function queryAnnotationTemplatesUsingGet(params?: RequestParams) {
 return get("/api/annotation/template", params);
 }

-export function createAnnotationTemplateUsingPost(data: any) {
+export function createAnnotationTemplateUsingPost(data: RequestPayload) {
 return post("/api/annotation/template", data);
 }

 export function updateAnnotationTemplateByIdUsingPut(
 templateId: string | number,
-data: any
+data: RequestPayload
 ) {
 return put(`/api/annotation/template/${templateId}`, data);
 }
@@ -65,7 +68,7 @@ export function getEditorProjectInfoUsingGet(projectId: string) {
 return get(`/api/annotation/editor/projects/${projectId}`);
 }

-export function listEditorTasksUsingGet(projectId: string, params?: any) {
+export function listEditorTasksUsingGet(projectId: string, params?: RequestParams) {
 return get(`/api/annotation/editor/projects/${projectId}/tasks`, params);
 }

@@ -77,11 +80,19 @@ export function getEditorTaskUsingGet(
 return get(`/api/annotation/editor/projects/${projectId}/tasks/${fileId}`, params);
 }

+export function getEditorTaskSegmentUsingGet(
+projectId: string,
+fileId: string,
+params: { segmentIndex: number }
+) {
+return get(`/api/annotation/editor/projects/${projectId}/tasks/${fileId}/segments`, params);
+}
+
 export function upsertEditorAnnotationUsingPut(
 projectId: string,
 fileId: string,
 data: {
-annotation: any;
+annotation: Record<string, unknown>;
 expectedUpdatedAt?: string;
 segmentIndex?: number;
 }
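For reference, a usage sketch of the new segment endpoint: the function signature and URL come from the diff above, while the import path and the shape of the resolved value are assumptions.

// Hypothetical import path — adjust to wherever the annotation service module actually lives.
import { getEditorTaskSegmentUsingGet } from "@/pages/DataAnnotation/annotation.service";

// Loads one segment of a segmented text file for the Label Studio editor.
async function loadSegment(projectId: string, fileId: string, segmentIndex: number) {
  const resp = await getEditorTaskSegmentUsingGet(projectId, fileId, { segmentIndex });
  // The other annotation APIs wrap results as { code, message, data }, so the same is assumed here.
  return resp;
}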
@@ -22,6 +22,7 @@ import {
|
|||||||
getObjectDisplayName,
|
getObjectDisplayName,
|
||||||
type LabelStudioTagConfig,
|
type LabelStudioTagConfig,
|
||||||
} from "../annotation.tagconfig";
|
} from "../annotation.tagconfig";
|
||||||
|
import { DataType } from "../annotation.model";
|
||||||
|
|
||||||
const { Text, Title } = Typography;
|
const { Text, Title } = Typography;
|
||||||
|
|
||||||
@@ -44,10 +45,22 @@ interface TemplateConfigurationTreeEditorProps {
|
|||||||
readOnly?: boolean;
|
readOnly?: boolean;
|
||||||
readOnlyStructure?: boolean;
|
readOnlyStructure?: boolean;
|
||||||
height?: number | string;
|
height?: number | string;
|
||||||
|
dataType?: DataType;
|
||||||
}
|
}
|
||||||
|
|
||||||
const DEFAULT_ROOT_TAG = "View";
|
const DEFAULT_ROOT_TAG = "View";
|
||||||
const CHILD_TAGS = ["Label", "Choice", "Relation", "Item", "Path", "Channel"];
|
const CHILD_TAGS = ["Label", "Choice", "Relation", "Item", "Path", "Channel"];
|
||||||
|
const OBJECT_TAGS_BY_DATA_TYPE: Record<DataType, string[]> = {
|
||||||
|
[DataType.TEXT]: ["Text", "Paragraphs", "Markdown"],
|
||||||
|
[DataType.IMAGE]: ["Image", "Bitmask"],
|
||||||
|
[DataType.AUDIO]: ["Audio", "AudioPlus"],
|
||||||
|
[DataType.VIDEO]: ["Video"],
|
||||||
|
[DataType.PDF]: ["PDF"],
|
||||||
|
[DataType.TIMESERIES]: ["Timeseries", "TimeSeries", "Vector"],
|
||||||
|
[DataType.CHAT]: ["Chat"],
|
||||||
|
[DataType.HTML]: ["HyperText", "Markdown"],
|
||||||
|
[DataType.TABLE]: ["Table", "Vector"],
|
||||||
|
};
|
||||||
|
|
||||||
const createId = () =>
|
const createId = () =>
|
||||||
`node_${Date.now().toString(36)}_${Math.random().toString(36).slice(2, 8)}`;
|
`node_${Date.now().toString(36)}_${Math.random().toString(36).slice(2, 8)}`;
|
||||||
@@ -247,19 +260,35 @@ const createNode = (
|
|||||||
attrs[attr] = "";
|
attrs[attr] = "";
|
||||||
});
|
});
|
||||||
|
|
||||||
if (objectConfig && attrs.name !== undefined) {
|
if (objectConfig) {
|
||||||
const name = getDefaultName(tag);
|
const name = getDefaultName(tag);
|
||||||
|
if (!attrs.name) {
|
||||||
attrs.name = name;
|
attrs.name = name;
|
||||||
if (attrs.value !== undefined) {
|
}
|
||||||
attrs.value = `$${name}`;
|
if (!attrs.value) {
|
||||||
|
attrs.value = `$${attrs.name}`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (controlConfig && attrs.name !== undefined) {
|
if (controlConfig) {
|
||||||
|
const isLabeling = controlConfig.category === "labeling";
|
||||||
|
|
||||||
|
if (isLabeling) {
|
||||||
|
if (!attrs.name) {
|
||||||
attrs.name = getDefaultName(tag);
|
attrs.name = getDefaultName(tag);
|
||||||
if (attrs.toName !== undefined) {
|
}
|
||||||
|
if (!attrs.toName) {
|
||||||
attrs.toName = objectNames[0] || "";
|
attrs.toName = objectNames[0] || "";
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// For layout controls, only fill if required
|
||||||
|
if (attrs.name !== undefined && !attrs.name) {
|
||||||
|
attrs.name = getDefaultName(tag);
|
||||||
|
}
|
||||||
|
if (attrs.toName !== undefined && !attrs.toName) {
|
||||||
|
attrs.toName = objectNames[0] || "";
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (CHILD_TAGS.includes(tag)) {
|
if (CHILD_TAGS.includes(tag)) {
|
||||||
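Editor note: condensed, the new defaulting rule is "always ensure a name, derive value as `$name` for object tags, point labeling controls at the first object". A simplified standalone sketch follows; the function shape is invented for illustration, only the attribute names and getDefaultName come from the diff (layout controls additionally only fill attributes they already declare).

// Simplified sketch of the defaulting rule introduced above.
function applyDefaultAttrs(
  attrs: Record<string, string>,
  tag: string,
  isObject: boolean,
  objectNames: string[]
): Record<string, string> {
  if (!attrs.name) attrs.name = getDefaultName(tag);
  if (isObject && !attrs.value) attrs.value = `$${attrs.name}`;
  if (!isObject && !attrs.toName) attrs.toName = objectNames[0] || "";
  return attrs;
}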
@@ -420,14 +449,13 @@ const TemplateConfigurationTreeEditor = ({
|
|||||||
readOnly = false,
|
readOnly = false,
|
||||||
readOnlyStructure = false,
|
readOnlyStructure = false,
|
||||||
height = 420,
|
height = 420,
|
||||||
|
dataType,
|
||||||
}: TemplateConfigurationTreeEditorProps) => {
|
}: TemplateConfigurationTreeEditorProps) => {
|
||||||
const { config } = useTagConfig(false);
|
const { config } = useTagConfig(false);
|
||||||
const [tree, setTree] = useState<XmlNode>(() => createEmptyTree());
|
const [tree, setTree] = useState<XmlNode>(() => createEmptyTree());
|
||||||
const [selectedId, setSelectedId] = useState<string>(tree.id);
|
const [selectedId, setSelectedId] = useState<string>(tree.id);
|
||||||
const [parseError, setParseError] = useState<string | null>(null);
|
const [parseError, setParseError] = useState<string | null>(null);
|
||||||
const lastSerialized = useRef<string>("");
|
const lastSerialized = useRef<string>("");
|
||||||
const [addChildTag, setAddChildTag] = useState<string | undefined>();
|
|
||||||
const [addSiblingTag, setAddSiblingTag] = useState<string | undefined>();
|
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (!value) {
|
if (!value) {
|
||||||
@@ -498,11 +526,17 @@ const TemplateConfigurationTreeEditor = ({
|
|||||||
|
|
||||||
const objectOptions = useMemo(() => {
|
const objectOptions = useMemo(() => {
|
||||||
if (!config?.objects) return [];
|
if (!config?.objects) return [];
|
||||||
return Object.keys(config.objects).map((tag) => ({
|
const options = Object.keys(config.objects).map((tag) => ({
|
||||||
value: tag,
|
value: tag,
|
||||||
label: getObjectDisplayName(tag),
|
label: getObjectDisplayName(tag),
|
||||||
}));
|
}));
|
||||||
}, [config]);
|
if (!dataType) return options;
|
||||||
|
const allowedTags = OBJECT_TAGS_BY_DATA_TYPE[dataType];
|
||||||
|
if (!allowedTags) return options;
|
||||||
|
const allowedSet = new Set(allowedTags);
|
||||||
|
const filtered = options.filter((option) => allowedSet.has(option.value));
|
||||||
|
return filtered.length > 0 ? filtered : options;
|
||||||
|
}, [config, dataType]);
|
||||||
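Editor note: the memo above narrows the object dropdown to the tags allowed for the current dataType but deliberately falls back to the full list when the intersection would be empty. The same pattern, extracted as a generic helper (illustration only):

// "Filter, but never end up with an empty dropdown."
function filterWithFallback<T>(items: T[], isAllowed: (item: T) => boolean): T[] {
  const filtered = items.filter(isAllowed);
  return filtered.length > 0 ? filtered : items;
}
// usage: filterWithFallback(options, (o) => allowedSet.has(o.value))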
|
|
||||||
const tagOptions = useMemo(() => {
|
const tagOptions = useMemo(() => {
|
||||||
const options = [] as {
|
const options = [] as {
|
||||||
@@ -763,9 +797,8 @@ const TemplateConfigurationTreeEditor = ({
|
|||||||
           <Select
             placeholder="添加子节点"
             options={tagOptions}
-            value={addChildTag}
+            value={null}
             onChange={(value) => {
-              setAddChildTag(undefined);
               handleAddNode(value, "child");
             }}
             disabled={isStructureLocked}
@@ -773,9 +806,8 @@ const TemplateConfigurationTreeEditor = ({
           <Select
             placeholder="添加同级节点"
             options={tagOptions}
-            value={addSiblingTag}
+            value={null}
             onChange={(value) => {
-              setAddSiblingTag(undefined);
               handleAddNode(value, "sibling");
             }}
             disabled={isStructureLocked || selectedNode.id === tree.id}
|
|||||||
@@ -4,14 +4,9 @@ import { ArrowLeft } from "lucide-react";
|
|||||||
import { Button, Form, App } from "antd";
|
import { Button, Form, App } from "antd";
|
||||||
import { Link, useLocation, useNavigate } from "react-router";
|
import { Link, useLocation, useNavigate } from "react-router";
|
||||||
import { createDatasetUsingPost } from "../dataset.api";
|
import { createDatasetUsingPost } from "../dataset.api";
|
||||||
import { datasetTypes } from "../dataset.const";
|
|
||||||
import { DatasetType } from "../dataset.model";
|
import { DatasetType } from "../dataset.model";
|
||||||
import BasicInformation from "./components/BasicInformation";
|
import BasicInformation from "./components/BasicInformation";
|
||||||
|
|
||||||
const textDatasetTypeOptions = datasetTypes.filter(
|
|
||||||
(type) => type.value === DatasetType.TEXT
|
|
||||||
);
|
|
||||||
|
|
||||||
export default function DatasetCreate() {
|
export default function DatasetCreate() {
|
||||||
const navigate = useNavigate();
|
const navigate = useNavigate();
|
||||||
const location = useLocation();
|
const location = useLocation();
|
||||||
@@ -87,7 +82,6 @@ export default function DatasetCreate() {
|
|||||||
data={newDataset}
|
data={newDataset}
|
||||||
setData={setNewDataset}
|
setData={setNewDataset}
|
||||||
hidden={["dataSource"]}
|
hidden={["dataSource"]}
|
||||||
datasetTypeOptions={textDatasetTypeOptions}
|
|
||||||
/>
|
/>
|
||||||
</Form>
|
</Form>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import { Dataset, DatasetType, DataSource } from "../../dataset.model";
|
|||||||
import { useCallback, useEffect, useMemo, useState } from "react";
|
import { useCallback, useEffect, useMemo, useState } from "react";
|
||||||
import { queryTasksUsingGet } from "@/pages/DataCollection/collection.apis";
|
import { queryTasksUsingGet } from "@/pages/DataCollection/collection.apis";
|
||||||
import { updateDatasetByIdUsingPut } from "../../dataset.api";
|
import { updateDatasetByIdUsingPut } from "../../dataset.api";
|
||||||
import { sliceFile } from "@/utils/file.util";
|
import { sliceFile, shouldStreamUpload } from "@/utils/file.util";
|
||||||
import Dragger from "antd/es/upload/Dragger";
|
import Dragger from "antd/es/upload/Dragger";
|
||||||
|
|
||||||
const TEXT_FILE_MIME_PREFIX = "text/";
|
const TEXT_FILE_MIME_PREFIX = "text/";
|
||||||
@@ -90,14 +90,16 @@ async function splitFileByLines(file: UploadFile): Promise<UploadFile[]> {
|
|||||||
const lines = text.split(/\r?\n/).filter((line: string) => line.trim() !== "");
|
const lines = text.split(/\r?\n/).filter((line: string) => line.trim() !== "");
|
||||||
if (lines.length === 0) return [];
|
if (lines.length === 0) return [];
|
||||||
|
|
||||||
// 生成文件名:原文件名_序号.扩展名
|
// 生成文件名:原文件名_序号(不保留后缀)
|
||||||
const nameParts = file.name.split(".");
|
const nameParts = file.name.split(".");
|
||||||
const ext = nameParts.length > 1 ? "." + nameParts.pop() : "";
|
if (nameParts.length > 1) {
|
||||||
|
nameParts.pop();
|
||||||
|
}
|
||||||
const baseName = nameParts.join(".");
|
const baseName = nameParts.join(".");
|
||||||
const padLength = String(lines.length).length;
|
const padLength = String(lines.length).length;
|
||||||
|
|
||||||
return lines.map((line: string, index: number) => {
|
return lines.map((line: string, index: number) => {
|
||||||
const newFileName = `${baseName}_${String(index + 1).padStart(padLength, "0")}${ext}`;
|
const newFileName = `${baseName}_${String(index + 1).padStart(padLength, "0")}`;
|
||||||
const blob = new Blob([line], { type: "text/plain" });
|
const blob = new Blob([line], { type: "text/plain" });
|
||||||
const newFile = new File([blob], newFileName, { type: "text/plain" });
|
const newFile = new File([blob], newFileName, { type: "text/plain" });
|
||||||
return {
|
return {
|
||||||
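Editor note: the split now names each line file with a zero-padded index and drops the original extension. A self-contained sketch of that naming scheme (the helper name is invented):

// buildSplitName("corpus.txt", 7, 120) -> "corpus_008"
function buildSplitName(originalName: string, index: number, totalLines: number): string {
  const nameParts = originalName.split(".");
  if (nameParts.length > 1) nameParts.pop(); // drop the extension, as in the diff
  const padLength = String(totalLines).length;
  return `${nameParts.join(".")}_${String(index + 1).padStart(padLength, "0")}`;
}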
@@ -164,17 +166,75 @@ export default function ImportConfiguration({
|
|||||||
// 本地上传文件相关逻辑
|
// 本地上传文件相关逻辑
|
||||||
|
|
||||||
const handleUpload = async (dataset: Dataset) => {
|
const handleUpload = async (dataset: Dataset) => {
|
||||||
let filesToUpload =
|
const filesToUpload =
|
||||||
(form.getFieldValue("files") as UploadFile[] | undefined) || [];
|
(form.getFieldValue("files") as UploadFile[] | undefined) || [];
|
||||||
|
|
||||||
// 如果启用分行分割,处理文件
|
// 如果启用分行分割,对大文件使用流式处理
|
||||||
if (importConfig.splitByLine && !hasNonTextFile) {
|
if (importConfig.splitByLine && !hasNonTextFile) {
|
||||||
const splitResults = await Promise.all(
|
// 检查是否有大文件需要流式分割上传
|
||||||
filesToUpload.map((file) => splitFileByLines(file))
|
const filesForStreamUpload: File[] = [];
|
||||||
);
|
const filesForNormalUpload: UploadFile[] = [];
|
||||||
filesToUpload = splitResults.flat();
|
|
||||||
|
for (const file of filesToUpload) {
|
||||||
|
const originFile = file.originFileObj ?? file;
|
||||||
|
if (originFile instanceof File && shouldStreamUpload(originFile)) {
|
||||||
|
filesForStreamUpload.push(originFile);
|
||||||
|
} else {
|
||||||
|
filesForNormalUpload.push(file);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 大文件使用流式分割上传
|
||||||
|
if (filesForStreamUpload.length > 0) {
|
||||||
|
window.dispatchEvent(
|
||||||
|
new CustomEvent("upload:dataset-stream", {
|
||||||
|
detail: {
|
||||||
|
dataset,
|
||||||
|
files: filesForStreamUpload,
|
||||||
|
updateEvent,
|
||||||
|
hasArchive: importConfig.hasArchive,
|
||||||
|
prefix: currentPrefix,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 小文件使用传统分割方式
|
||||||
|
if (filesForNormalUpload.length > 0) {
|
||||||
|
const splitResults = await Promise.all(
|
||||||
|
filesForNormalUpload.map((file) => splitFileByLines(file))
|
||||||
|
);
|
||||||
|
const smallFilesToUpload = splitResults.flat();
|
||||||
|
|
||||||
|
// 计算分片列表
|
||||||
|
const sliceList = smallFilesToUpload.map((file) => {
|
||||||
|
const originFile = (file.originFileObj ?? file) as Blob;
|
||||||
|
const slices = sliceFile(originFile);
|
||||||
|
return {
|
||||||
|
originFile: originFile,
|
||||||
|
slices,
|
||||||
|
name: file.name,
|
||||||
|
size: originFile.size || 0,
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log("[ImportConfiguration] Uploading small files with currentPrefix:", currentPrefix);
|
||||||
|
window.dispatchEvent(
|
||||||
|
new CustomEvent("upload:dataset", {
|
||||||
|
detail: {
|
||||||
|
dataset,
|
||||||
|
files: sliceList,
|
||||||
|
updateEvent,
|
||||||
|
hasArchive: importConfig.hasArchive,
|
||||||
|
prefix: currentPrefix,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 未启用分行分割,使用普通上传
|
||||||
// 计算分片列表
|
// 计算分片列表
|
||||||
const sliceList = filesToUpload.map((file) => {
|
const sliceList = filesToUpload.map((file) => {
|
||||||
const originFile = (file.originFileObj ?? file) as Blob;
|
const originFile = (file.originFileObj ?? file) as Blob;
|
||||||
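Editor note: after this change the upload path fans out into two window events — "upload:dataset-stream" for files above the streaming threshold and "upload:dataset" for pre-sliced small files. A hedged sketch of the receiving side; only the event names and detail fields are taken from the dispatch code above, the handlers themselves are placeholders.

// Placeholder handlers; detail fields mirror the CustomEvent payloads dispatched above.
window.addEventListener("upload:dataset-stream", (e) => {
  const { dataset, files, prefix } = (e as CustomEvent).detail;
  console.log(`stream-upload ${files.length} large file(s) into ${dataset.id} under "${prefix}"`);
});

window.addEventListener("upload:dataset", (e) => {
  const { files } = (e as CustomEvent).detail;
  console.log(`chunk-upload ${files.length} pre-sliced file(s)`);
});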
@@ -234,6 +294,10 @@ export default function ImportConfiguration({
|
|||||||
if (!data) return;
|
if (!data) return;
|
||||||
console.log('[ImportConfiguration] handleImportData called, currentPrefix:', currentPrefix);
|
console.log('[ImportConfiguration] handleImportData called, currentPrefix:', currentPrefix);
|
||||||
if (importConfig.source === DataSource.UPLOAD) {
|
if (importConfig.source === DataSource.UPLOAD) {
|
||||||
|
// 立即显示任务中心,让用户感知上传已开始(在文件分割等耗时操作之前)
|
||||||
|
window.dispatchEvent(
|
||||||
|
new CustomEvent("show:task-popover", { detail: { show: true } })
|
||||||
|
);
|
||||||
await handleUpload(data);
|
await handleUpload(data);
|
||||||
} else if (importConfig.source === DataSource.COLLECTION) {
|
} else if (importConfig.source === DataSource.COLLECTION) {
|
||||||
await updateDatasetByIdUsingPut(data.id, {
|
await updateDatasetByIdUsingPut(data.id, {
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ import type {
|
|||||||
Dataset,
|
Dataset,
|
||||||
DatasetFile,
|
DatasetFile,
|
||||||
} from "@/pages/DataManagement/dataset.model";
|
} from "@/pages/DataManagement/dataset.model";
|
||||||
import { DatasetType } from "@/pages/DataManagement/dataset.model";
|
|
||||||
import { App } from "antd";
|
import { App } from "antd";
|
||||||
import { useCallback, useEffect, useRef, useState } from "react";
|
import { useCallback, useEffect, useRef, useState } from "react";
|
||||||
import {
|
import {
|
||||||
@@ -25,7 +24,6 @@ import {
|
|||||||
import { useParams } from "react-router";
|
import { useParams } from "react-router";
|
||||||
|
|
||||||
const OFFICE_FILE_EXTENSIONS = [".doc", ".docx"];
|
const OFFICE_FILE_EXTENSIONS = [".doc", ".docx"];
|
||||||
const TEXT_DATASET_TYPE_PREFIX = DatasetType.TEXT;
|
|
||||||
const OFFICE_PREVIEW_POLL_INTERVAL = 2000;
|
const OFFICE_PREVIEW_POLL_INTERVAL = 2000;
|
||||||
const OFFICE_PREVIEW_POLL_MAX_TIMES = 60;
|
const OFFICE_PREVIEW_POLL_MAX_TIMES = 60;
|
||||||
|
|
||||||
@@ -87,13 +85,6 @@ export function useFilesOperation(dataset: Dataset) {
|
|||||||
};
|
};
|
||||||
}, [clearOfficePreviewPolling]);
|
}, [clearOfficePreviewPolling]);
|
||||||
|
|
||||||
const isTextDataset = (datasetType?: string) => {
|
|
||||||
if (!datasetType) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return datasetType.toUpperCase().startsWith(TEXT_DATASET_TYPE_PREFIX);
|
|
||||||
};
|
|
||||||
|
|
||||||
const fetchFiles = async (
|
const fetchFiles = async (
|
||||||
prefix?: string,
|
prefix?: string,
|
||||||
current?: number,
|
current?: number,
|
||||||
@@ -101,14 +92,13 @@ export function useFilesOperation(dataset: Dataset) {
|
|||||||
) => {
|
) => {
|
||||||
// 如果明确传了 prefix(包括空字符串),使用传入的值;否则使用当前 pagination.prefix
|
// 如果明确传了 prefix(包括空字符串),使用传入的值;否则使用当前 pagination.prefix
|
||||||
const targetPrefix = prefix !== undefined ? prefix : (pagination.prefix || '');
|
const targetPrefix = prefix !== undefined ? prefix : (pagination.prefix || '');
|
||||||
const shouldExcludeDerivedFiles = isTextDataset(dataset?.datasetType);
|
|
||||||
|
|
||||||
const params: DatasetFilesQueryParams = {
|
const params: DatasetFilesQueryParams = {
|
||||||
page: current !== undefined ? current : pagination.current,
|
page: current !== undefined ? current : pagination.current,
|
||||||
size: pageSize !== undefined ? pageSize : pagination.pageSize,
|
size: pageSize !== undefined ? pageSize : pagination.pageSize,
|
||||||
isWithDirectory: true,
|
isWithDirectory: true,
|
||||||
prefix: targetPrefix,
|
prefix: targetPrefix,
|
||||||
...(shouldExcludeDerivedFiles ? { excludeDerivedFiles: true } : {}),
|
excludeDerivedFiles: true,
|
||||||
};
|
};
|
||||||
|
|
||||||
const { data } = await queryDatasetFilesUsingGet(id!, params);
|
const { data } = await queryDatasetFilesUsingGet(id!, params);
|
||||||
|
|||||||
@@ -102,6 +102,13 @@ export interface DatasetTask {
|
|||||||
executionHistory?: { time: string; status: string }[];
|
executionHistory?: { time: string; status: string }[];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface StreamUploadInfo {
|
||||||
|
currentFile: string;
|
||||||
|
fileIndex: number;
|
||||||
|
totalFiles: number;
|
||||||
|
uploadedLines: number;
|
||||||
|
}
|
||||||
|
|
||||||
export interface TaskItem {
|
export interface TaskItem {
|
||||||
key: string;
|
key: string;
|
||||||
title: string;
|
title: string;
|
||||||
@@ -113,4 +120,6 @@ export interface TaskItem {
|
|||||||
updateEvent?: string;
|
updateEvent?: string;
|
||||||
size?: number;
|
size?: number;
|
||||||
hasArchive?: boolean;
|
hasArchive?: boolean;
|
||||||
|
prefix?: string;
|
||||||
|
streamUploadInfo?: StreamUploadInfo;
|
||||||
}
|
}
|
||||||
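Editor note: an example of what a task entry carrying the new stream counters could look like; the values are placeholders, and Partial is used because TaskItem has more required fields than this hunk shows.

const exampleTask: Partial<TaskItem> = {
  key: "upload-1",
  title: "corpus.txt",
  prefix: "raw/",
  hasArchive: false,
  streamUploadInfo: { currentFile: "corpus.txt", fileIndex: 1, totalFiles: 3, uploadedLines: 1200 },
};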
|
|||||||
@@ -28,7 +28,12 @@ import {
|
|||||||
queryKnowledgeItemsUsingGet,
|
queryKnowledgeItemsUsingGet,
|
||||||
queryKnowledgeItemPreviewStatusUsingGet,
|
queryKnowledgeItemPreviewStatusUsingGet,
|
||||||
queryKnowledgeSetByIdUsingGet,
|
queryKnowledgeSetByIdUsingGet,
|
||||||
|
updateKnowledgeSetByIdUsingPut,
|
||||||
} from "../knowledge-management.api";
|
} from "../knowledge-management.api";
|
||||||
|
import {
|
||||||
|
createDatasetTagUsingPost,
|
||||||
|
queryDatasetTagsUsingGet,
|
||||||
|
} from "../../DataManagement/dataset.api";
|
||||||
import {
|
import {
|
||||||
knowledgeContentTypeOptions,
|
knowledgeContentTypeOptions,
|
||||||
knowledgeSourceTypeOptions,
|
knowledgeSourceTypeOptions,
|
||||||
@@ -48,7 +53,7 @@ import CreateKnowledgeSet from "../components/CreateKnowledgeSet";
|
|||||||
import KnowledgeItemEditor from "../components/KnowledgeItemEditor";
|
import KnowledgeItemEditor from "../components/KnowledgeItemEditor";
|
||||||
import ImportKnowledgeItemsDialog from "../components/ImportKnowledgeItemsDialog";
|
import ImportKnowledgeItemsDialog from "../components/ImportKnowledgeItemsDialog";
|
||||||
import { formatDate } from "@/utils/unit";
|
import { formatDate } from "@/utils/unit";
|
||||||
import { File, Folder } from "lucide-react";
|
import { File, Folder, Clock } from "lucide-react";
|
||||||
import {
|
import {
|
||||||
PREVIEW_TEXT_MAX_LENGTH,
|
PREVIEW_TEXT_MAX_LENGTH,
|
||||||
resolvePreviewFileType,
|
resolvePreviewFileType,
|
||||||
@@ -70,6 +75,30 @@ const OFFICE_PREVIEW_POLL_MAX_TIMES = 60;
|
|||||||
|
|
||||||
type OfficePreviewStatus = "UNSET" | "PENDING" | "PROCESSING" | "READY" | "FAILED";
|
type OfficePreviewStatus = "UNSET" | "PENDING" | "PROCESSING" | "READY" | "FAILED";
|
||||||
|
|
||||||
|
const parseMetadata = (value?: string | Record<string, unknown>) => {
|
||||||
|
if (!value) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if (typeof value === "object") {
|
||||||
|
return value as Record<string, unknown>;
|
||||||
|
}
|
||||||
|
if (typeof value !== "string") {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
const parsed = JSON.parse(value);
|
||||||
|
return parsed && typeof parsed === "object" ? (parsed as Record<string, unknown>) : null;
|
||||||
|
} catch {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const isAnnotationItem = (record: KnowledgeItemView) => {
|
||||||
|
const metadata = parseMetadata(record.metadata);
|
||||||
|
const source = metadata && typeof metadata === "object" ? (metadata as { source?: { type?: string } }).source : null;
|
||||||
|
return source?.type === "annotation";
|
||||||
|
};
|
||||||
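Editor note: isAnnotationItem treats an item as annotation-derived when its metadata — stored either as a JSON string or as an object — carries source.type === "annotation". An example value that would match (the extra field is illustrative):

// Metadata that isAnnotationItem() classifies as annotation-derived.
const exampleMetadata = JSON.stringify({
  source: { type: "annotation", projectId: "proj-123" }, // projectId is made up
});
// parseMetadata(exampleMetadata) -> { source: { type: "annotation", ... } } -> read as plain content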
|
|
||||||
const isOfficeFileName = (fileName?: string) => {
|
const isOfficeFileName = (fileName?: string) => {
|
||||||
const lowerName = (fileName || "").toLowerCase();
|
const lowerName = (fileName || "").toLowerCase();
|
||||||
return OFFICE_FILE_EXTENSIONS.some((ext) => lowerName.endsWith(ext));
|
return OFFICE_FILE_EXTENSIONS.some((ext) => lowerName.endsWith(ext));
|
||||||
@@ -400,7 +429,7 @@ const KnowledgeSetDetail = () => {
|
|||||||
if (currentStatus === "FAILED") {
|
if (currentStatus === "FAILED") {
|
||||||
setOfficePreviewStatus("PROCESSING");
|
setOfficePreviewStatus("PROCESSING");
|
||||||
}
|
}
|
||||||
if (currentStatus === "PROCESSING" || currentStatus === "PENDING") {
|
if (currentStatus === "PROCESSING") {
|
||||||
pollOfficePreviewStatus(id, record.id, 0);
|
pollOfficePreviewStatus(id, record.id, 0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -483,7 +512,7 @@ const KnowledgeSetDetail = () => {
|
|||||||
setReadItemId(record.id);
|
setReadItemId(record.id);
|
||||||
setReadTitle("知识条目");
|
setReadTitle("知识条目");
|
||||||
|
|
||||||
if (!record.sourceDatasetId || !record.sourceFileId) {
|
if (!record.sourceDatasetId || !record.sourceFileId || isAnnotationItem(record)) {
|
||||||
const content = record.content || "";
|
const content = record.content || "";
|
||||||
setReadContent(truncatePreviewText(content, PREVIEW_TEXT_MAX_LENGTH));
|
setReadContent(truncatePreviewText(content, PREVIEW_TEXT_MAX_LENGTH));
|
||||||
setReadModalOpen(true);
|
setReadModalOpen(true);
|
||||||
@@ -527,14 +556,12 @@ const KnowledgeSetDetail = () => {
|
|||||||
() => [
|
() => [
|
||||||
{
|
{
|
||||||
key: "items",
|
key: "items",
|
||||||
icon: <PlusOutlined className="text-blue-500" />,
|
icon: <File className="text-blue-400 w-4 h-4" />,
|
||||||
label: "条目数",
|
|
||||||
value: allItems.length,
|
value: allItems.length,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
key: "updated",
|
key: "updated",
|
||||||
icon: <EditOutlined className="text-gray-500" />,
|
icon: <Clock className="text-blue-400 w-4 h-4" />,
|
||||||
label: "更新时间",
|
|
||||||
value: knowledgeSet?.updatedAt ? formatDate(knowledgeSet.updatedAt) : "-",
|
value: knowledgeSet?.updatedAt ? formatDate(knowledgeSet.updatedAt) : "-",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
@@ -916,6 +943,60 @@ const KnowledgeSetDetail = () => {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
]}
|
]}
|
||||||
|
tagConfig={{
|
||||||
|
showAdd: true,
|
||||||
|
tags: knowledgeSet?.tags || [],
|
||||||
|
onFetchTags: async () => {
|
||||||
|
const res = await queryDatasetTagsUsingGet({
|
||||||
|
page: 0,
|
||||||
|
pageSize: 1000,
|
||||||
|
});
|
||||||
|
return res.data || [];
|
||||||
|
},
|
||||||
|
onCreateAndTag: async (tagName) => {
|
||||||
|
const res = await createDatasetTagUsingPost({ name: tagName });
|
||||||
|
if (res.data && knowledgeSet) {
|
||||||
|
const currentTags = knowledgeSet.tags || [];
|
||||||
|
await updateKnowledgeSetByIdUsingPut(knowledgeSet.id, {
|
||||||
|
name: knowledgeSet.name,
|
||||||
|
description: knowledgeSet.description,
|
||||||
|
status: knowledgeSet.status,
|
||||||
|
domain: knowledgeSet.domain,
|
||||||
|
businessLine: knowledgeSet.businessLine,
|
||||||
|
owner: knowledgeSet.owner,
|
||||||
|
validFrom: knowledgeSet.validFrom,
|
||||||
|
validTo: knowledgeSet.validTo,
|
||||||
|
sourceType: knowledgeSet.sourceType,
|
||||||
|
sensitivity: knowledgeSet.sensitivity,
|
||||||
|
metadata: knowledgeSet.metadata,
|
||||||
|
tags: [...currentTags.map((tag) => tag.name), res.data.name],
|
||||||
|
});
|
||||||
|
fetchKnowledgeSet();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
onAddTag: async (tagName: string) => {
|
||||||
|
if (knowledgeSet) {
|
||||||
|
const currentTags = knowledgeSet.tags || [];
|
||||||
|
const newTagName = tagName?.trim();
|
||||||
|
if (!newTagName) return;
|
||||||
|
await updateKnowledgeSetByIdUsingPut(knowledgeSet.id, {
|
||||||
|
name: knowledgeSet.name,
|
||||||
|
description: knowledgeSet.description,
|
||||||
|
status: knowledgeSet.status,
|
||||||
|
domain: knowledgeSet.domain,
|
||||||
|
businessLine: knowledgeSet.businessLine,
|
||||||
|
owner: knowledgeSet.owner,
|
||||||
|
validFrom: knowledgeSet.validFrom,
|
||||||
|
validTo: knowledgeSet.validTo,
|
||||||
|
sourceType: knowledgeSet.sourceType,
|
||||||
|
sensitivity: knowledgeSet.sensitivity,
|
||||||
|
metadata: knowledgeSet.metadata,
|
||||||
|
tags: [...currentTags.map((tag) => tag.name), newTagName],
|
||||||
|
});
|
||||||
|
fetchKnowledgeSet();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}}
|
||||||
/>
|
/>
|
||||||
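Editor note: both tag callbacks above rebuild the full PUT payload field by field and differ only in the tag list. A small builder would remove the duplication; this is a suggested refactor sketched from the fields used above, not part of the change.

// Suggested (not in the diff): derive the update payload from the current knowledge set.
function buildKnowledgeSetUpdate(set: KnowledgeSet, tagNames: string[]) {
  const { name, description, status, domain, businessLine, owner,
          validFrom, validTo, sourceType, sensitivity, metadata } = set;
  return { name, description, status, domain, businessLine, owner,
           validFrom, validTo, sourceType, sensitivity, metadata, tags: tagNames };
}
// usage:
// await updateKnowledgeSetByIdUsingPut(set.id,
//   buildKnowledgeSetUpdate(set, [...(set.tags ?? []).map((t) => t.name), newTagName]));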
|
|
||||||
<CreateKnowledgeSet
|
<CreateKnowledgeSet
|
||||||
@@ -934,7 +1015,7 @@ const KnowledgeSetDetail = () => {
|
|||||||
<Descriptions.Item label="领域">{knowledgeSet?.domain || "-"}</Descriptions.Item>
|
<Descriptions.Item label="领域">{knowledgeSet?.domain || "-"}</Descriptions.Item>
|
||||||
<Descriptions.Item label="业务线">{knowledgeSet?.businessLine || "-"}</Descriptions.Item>
|
<Descriptions.Item label="业务线">{knowledgeSet?.businessLine || "-"}</Descriptions.Item>
|
||||||
<Descriptions.Item label="负责人">{knowledgeSet?.owner || "-"}</Descriptions.Item>
|
<Descriptions.Item label="负责人">{knowledgeSet?.owner || "-"}</Descriptions.Item>
|
||||||
<Descriptions.Item label="敏感级别">{knowledgeSet?.sensitivity || "-"}</Descriptions.Item>
|
{/* <Descriptions.Item label="敏感级别">{knowledgeSet?.sensitivity || "-"}</Descriptions.Item> */}
|
||||||
<Descriptions.Item label="有效期">
|
<Descriptions.Item label="有效期">
|
||||||
{knowledgeSet?.validFrom || "-"} ~ {knowledgeSet?.validTo || "-"}
|
{knowledgeSet?.validFrom || "-"} ~ {knowledgeSet?.validTo || "-"}
|
||||||
</Descriptions.Item>
|
</Descriptions.Item>
|
||||||
|
|||||||
@@ -257,7 +257,7 @@ export default function KnowledgeManagementPage() {
|
|||||||
return (
|
return (
|
||||||
<div className="h-full flex flex-col gap-4">
|
<div className="h-full flex flex-col gap-4">
|
||||||
<div className="flex items-center justify-between">
|
<div className="flex items-center justify-between">
|
||||||
<h1 className="text-xl font-bold">知识管理</h1>
|
<h1 className="text-xl font-bold">知识集</h1>
|
||||||
<div className="flex gap-2 items-center">
|
<div className="flex gap-2 items-center">
|
||||||
<Button onClick={() => navigate("/data/knowledge-management/search")}>
|
<Button onClick={() => navigate("/data/knowledge-management/search")}>
|
||||||
全库搜索
|
全库搜索
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import {
|
|||||||
import {
|
import {
|
||||||
knowledgeSourceTypeOptions,
|
knowledgeSourceTypeOptions,
|
||||||
knowledgeStatusOptions,
|
knowledgeStatusOptions,
|
||||||
|
// sensitivityOptions,
|
||||||
} from "../knowledge-management.const";
|
} from "../knowledge-management.const";
|
||||||
import {
|
import {
|
||||||
KnowledgeSet,
|
KnowledgeSet,
|
||||||
@@ -169,9 +170,9 @@ export default function CreateKnowledgeSet({
|
|||||||
<Form.Item label="负责人" name="owner">
|
<Form.Item label="负责人" name="owner">
|
||||||
<Input placeholder="请输入负责人" />
|
<Input placeholder="请输入负责人" />
|
||||||
</Form.Item>
|
</Form.Item>
|
||||||
<Form.Item label="敏感级别" name="sensitivity">
|
{/* <Form.Item label="敏感级别" name="sensitivity">
|
||||||
<Input placeholder="请输入敏感级别" />
|
<Select options={sensitivityOptions} placeholder="请选择敏感级别" />
|
||||||
</Form.Item>
|
</Form.Item> */}
|
||||||
</div>
|
</div>
|
||||||
<div className="grid grid-cols-2 gap-4">
|
<div className="grid grid-cols-2 gap-4">
|
||||||
<Form.Item label="有效期开始" name="validFrom">
|
<Form.Item label="有效期开始" name="validFrom">
|
||||||
@@ -191,9 +192,6 @@ export default function CreateKnowledgeSet({
|
|||||||
placeholder="请选择或输入标签"
|
placeholder="请选择或输入标签"
|
||||||
/>
|
/>
|
||||||
</Form.Item>
|
</Form.Item>
|
||||||
<Form.Item label="扩展元数据" name="metadata">
|
|
||||||
<Input.TextArea placeholder="请输入元数据(JSON)" rows={3} />
|
|
||||||
</Form.Item>
|
|
||||||
</Form>
|
</Form>
|
||||||
</Modal>
|
</Modal>
|
||||||
</>
|
</>
|
||||||
|
|||||||
@@ -66,6 +66,11 @@ export const knowledgeSourceTypeOptions = [
|
|||||||
{ label: "文件上传", value: KnowledgeSourceType.FILE_UPLOAD },
|
{ label: "文件上传", value: KnowledgeSourceType.FILE_UPLOAD },
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// export const sensitivityOptions = [
|
||||||
|
// { label: "敏感", value: "敏感" },
|
||||||
|
// { label: "不敏感", value: "不敏感" },
|
||||||
|
// ];
|
||||||
|
|
||||||
export type KnowledgeSetView = {
|
export type KnowledgeSetView = {
|
||||||
id: string;
|
id: string;
|
||||||
name: string;
|
name: string;
|
||||||
|
|||||||
@@ -3,25 +3,28 @@ import {
|
|||||||
preUploadUsingPost,
|
preUploadUsingPost,
|
||||||
uploadFileChunkUsingPost,
|
uploadFileChunkUsingPost,
|
||||||
} from "@/pages/DataManagement/dataset.api";
|
} from "@/pages/DataManagement/dataset.api";
|
||||||
import { Button, Empty, Progress } from "antd";
|
import { Button, Empty, Progress, Tag } from "antd";
|
||||||
import { DeleteOutlined } from "@ant-design/icons";
|
import { DeleteOutlined, FileTextOutlined } from "@ant-design/icons";
|
||||||
import { useEffect } from "react";
|
import { useEffect } from "react";
|
||||||
import { useFileSliceUpload } from "@/hooks/useSliceUpload";
|
import { useFileSliceUpload } from "@/hooks/useSliceUpload";
|
||||||
|
|
||||||
export default function TaskUpload() {
|
export default function TaskUpload() {
|
||||||
const { createTask, taskList, removeTask, handleUpload } = useFileSliceUpload(
|
const { createTask, taskList, removeTask, handleUpload, registerStreamUploadListener } = useFileSliceUpload(
|
||||||
{
|
{
|
||||||
preUpload: preUploadUsingPost,
|
preUpload: preUploadUsingPost,
|
||||||
uploadChunk: uploadFileChunkUsingPost,
|
uploadChunk: uploadFileChunkUsingPost,
|
||||||
cancelUpload: cancelUploadUsingPut,
|
cancelUpload: cancelUploadUsingPut,
|
||||||
}
|
},
|
||||||
|
true, // showTaskCenter
|
||||||
|
true // enableStreamUpload
|
||||||
);
|
);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
const uploadHandler = (e: any) => {
|
const uploadHandler = (e: Event) => {
|
||||||
console.log('[TaskUpload] Received upload event detail:', e.detail);
|
const customEvent = e as CustomEvent;
|
||||||
const { files } = e.detail;
|
console.log('[TaskUpload] Received upload event detail:', customEvent.detail);
|
||||||
const task = createTask(e.detail);
|
const { files } = customEvent.detail;
|
||||||
|
const task = createTask(customEvent.detail);
|
||||||
console.log('[TaskUpload] Created task with prefix:', task.prefix);
|
console.log('[TaskUpload] Created task with prefix:', task.prefix);
|
||||||
handleUpload({ task, files });
|
handleUpload({ task, files });
|
||||||
};
|
};
|
||||||
@@ -29,7 +32,13 @@ export default function TaskUpload() {
|
|||||||
return () => {
|
return () => {
|
||||||
window.removeEventListener("upload:dataset", uploadHandler);
|
window.removeEventListener("upload:dataset", uploadHandler);
|
||||||
};
|
};
|
||||||
}, []);
|
}, [createTask, handleUpload]);
|
||||||
|
|
||||||
|
// 注册流式上传监听器
|
||||||
|
useEffect(() => {
|
||||||
|
const unregister = registerStreamUploadListener();
|
||||||
|
return unregister;
|
||||||
|
}, [registerStreamUploadListener]);
|
||||||
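Editor note: returning the unregister function directly from the effect works because registerStreamUploadListener is expected to hand back its own cleanup. A minimal sketch of that contract (internals assumed; only the event name comes from the dispatch side):

// Assumed shape: subscribe on call, return the matching cleanup.
function registerStreamUploadListenerSketch(onStream: (detail: unknown) => void): () => void {
  const handler = (e: Event) => onStream((e as CustomEvent).detail);
  window.addEventListener("upload:dataset-stream", handler);
  return () => window.removeEventListener("upload:dataset-stream", handler);
}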
|
|
||||||
return (
|
return (
|
||||||
<div
|
<div
|
||||||
@@ -55,7 +64,22 @@ export default function TaskUpload() {
|
|||||||
></Button>
|
></Button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<Progress size="small" percent={task.percent} />
|
<Progress size="small" percent={Number(task.percent)} />
|
||||||
|
{task.streamUploadInfo && (
|
||||||
|
<div className="flex items-center gap-2 text-xs text-gray-500 mt-1">
|
||||||
|
<Tag icon={<FileTextOutlined />} size="small">
|
||||||
|
按行分割
|
||||||
|
</Tag>
|
||||||
|
<span>
|
||||||
|
已上传: {task.streamUploadInfo.uploadedLines} 行
|
||||||
|
</span>
|
||||||
|
{task.streamUploadInfo.totalFiles > 1 && (
|
||||||
|
<span>
|
||||||
|
({task.streamUploadInfo.fileIndex}/{task.streamUploadInfo.totalFiles} 文件)
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
))}
|
))}
|
||||||
{taskList.length === 0 && (
|
{taskList.length === 0 && (
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ const LoginPage: React.FC = () => {
           <Title level={2} className="!text-white !mb-2 tracking-wide font-bold">
             DataBuilder
           </Title>
-          <Text className="text-gray-400 text-sm tracking-wider">
+          <Text className="text-gray-400! text-sm tracking-wider">
             一站式数据工作平台
           </Text>
         </div>
@@ -100,8 +100,8 @@ const LoginPage: React.FC = () => {
           </Form.Item>
 
           <div className="text-center mt-4">
-            <Text className="text-gray-600 text-xs">
-              企业级数据处理平台 · 安全接入
+            <Text className="text-gray-600! text-xs">
+              数据处理平台 · 安全接入
             </Text>
           </div>
         </Form>
|
|||||||
@@ -1,79 +1,657 @@
|
|||||||
import { UploadFile } from "antd";
|
import { UploadFile } from "antd";
|
||||||
import jsSHA from "jssha";
|
import jsSHA from "jssha";
|
||||||
|
|
||||||
const CHUNK_SIZE = 1024 * 1024 * 60;
|
// 默认分片大小:5MB(适合大多数网络环境)
|
||||||
|
export const DEFAULT_CHUNK_SIZE = 1024 * 1024 * 5;
|
||||||
|
// 大文件阈值:10MB
|
||||||
|
export const LARGE_FILE_THRESHOLD = 1024 * 1024 * 10;
|
||||||
|
// 最大并发上传数
|
||||||
|
export const MAX_CONCURRENT_UPLOADS = 3;
|
||||||
|
// 文本文件读取块大小:20MB(用于计算 SHA256)
|
||||||
|
const BUFFER_CHUNK_SIZE = 1024 * 1024 * 20;
|
||||||
|
|
||||||
export function sliceFile(file, chunkSize = CHUNK_SIZE): Blob[] {
|
/**
|
||||||
|
* 将文件分割为多个分片
|
||||||
|
* @param file 文件对象
|
||||||
|
* @param chunkSize 分片大小(字节),默认 5MB
|
||||||
|
* @returns 分片数组(Blob 列表)
|
||||||
|
*/
|
||||||
|
export function sliceFile(file: Blob, chunkSize = DEFAULT_CHUNK_SIZE): Blob[] {
|
||||||
const totalSize = file.size;
|
const totalSize = file.size;
|
||||||
|
const chunks: Blob[] = [];
|
||||||
|
|
||||||
|
// 小文件不需要分片
|
||||||
|
if (totalSize <= chunkSize) {
|
||||||
|
return [file];
|
||||||
|
}
|
||||||
|
|
||||||
let start = 0;
|
let start = 0;
|
||||||
let end = start + chunkSize;
|
|
||||||
const chunks = [];
|
|
||||||
while (start < totalSize) {
|
while (start < totalSize) {
|
||||||
|
const end = Math.min(start + chunkSize, totalSize);
|
||||||
const blob = file.slice(start, end);
|
const blob = file.slice(start, end);
|
||||||
chunks.push(blob);
|
chunks.push(blob);
|
||||||
|
|
||||||
start = end;
|
start = end;
|
||||||
end = start + chunkSize;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return chunks;
|
return chunks;
|
||||||
}
|
}
|
||||||
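Editor note: with the rewrite, a blob at or below the chunk size comes back as a one-element array, so callers can treat small and large files the same way. A quick usage check (sizes are illustrative):

const big = new Blob([new Uint8Array(12 * 1024 * 1024)]); // 12 MB
console.log(sliceFile(big).length);                 // 3 with the 5 MB default
console.log(sliceFile(big, 1024 * 1024).length);    // 12 with a 1 MB chunk size
console.log(sliceFile(new Blob(["hello"])).length); // 1 (no slicing needed)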
|
|
||||||
export function calculateSHA256(file: Blob): Promise<string> {
|
/**
|
||||||
let count = 0;
|
* 计算文件的 SHA256 哈希值
|
||||||
const hash = new jsSHA("SHA-256", "ARRAYBUFFER", { encoding: "UTF8" });
|
* @param file 文件 Blob
|
||||||
|
* @param onProgress 进度回调(可选)
|
||||||
|
* @returns SHA256 哈希字符串
|
||||||
|
*/
|
||||||
|
export function calculateSHA256(
|
||||||
|
file: Blob,
|
||||||
|
onProgress?: (percent: number) => void
|
||||||
|
): Promise<string> {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
|
const hash = new jsSHA("SHA-256", "ARRAYBUFFER", { encoding: "UTF8" });
|
||||||
const reader = new FileReader();
|
const reader = new FileReader();
|
||||||
|
let processedSize = 0;
|
||||||
|
|
||||||
function readChunk(start: number, end: number) {
|
function readChunk(start: number, end: number) {
|
||||||
const slice = file.slice(start, end);
|
const slice = file.slice(start, end);
|
||||||
reader.readAsArrayBuffer(slice);
|
reader.readAsArrayBuffer(slice);
|
||||||
}
|
}
|
||||||
|
|
||||||
const bufferChunkSize = 1024 * 1024 * 20;
|
|
||||||
|
|
||||||
function processChunk(offset: number) {
|
function processChunk(offset: number) {
|
||||||
const start = offset;
|
const start = offset;
|
||||||
const end = Math.min(start + bufferChunkSize, file.size);
|
const end = Math.min(start + BUFFER_CHUNK_SIZE, file.size);
|
||||||
count = end;
|
|
||||||
|
|
||||||
readChunk(start, end);
|
readChunk(start, end);
|
||||||
}
|
}
|
||||||
|
|
||||||
reader.onloadend = function () {
|
reader.onloadend = function (e) {
|
||||||
const arraybuffer = reader.result;
|
const arraybuffer = reader.result as ArrayBuffer;
|
||||||
|
if (!arraybuffer) {
|
||||||
|
reject(new Error("Failed to read file"));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
hash.update(arraybuffer);
|
hash.update(arraybuffer);
|
||||||
if (count < file.size) {
|
processedSize += (e.target as FileReader).result?.byteLength || 0;
|
||||||
processChunk(count);
|
|
||||||
|
if (onProgress) {
|
||||||
|
const percent = Math.min(100, Math.round((processedSize / file.size) * 100));
|
||||||
|
onProgress(percent);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (processedSize < file.size) {
|
||||||
|
processChunk(processedSize);
|
||||||
} else {
|
} else {
|
||||||
resolve(hash.getHash("HEX", { outputLen: 256 }));
|
resolve(hash.getHash("HEX", { outputLen: 256 }));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
reader.onerror = () => reject(new Error("File reading failed"));
|
||||||
processChunk(0);
|
processChunk(0);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
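Editor note: the hash is fed to jsSHA in 20 MB windows, so the new optional callback reports progress roughly once per window. Usage sketch:

async function hashWithProgress(blob: Blob): Promise<string> {
  return calculateSHA256(blob, (percent) => {
    console.log(`hashing: ${percent}%`); // fires about every BUFFER_CHUNK_SIZE bytes
  });
}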
|
|
||||||
|
/**
|
||||||
|
* 批量计算多个文件的 SHA256
|
||||||
|
* @param files 文件列表
|
||||||
|
* @param onFileProgress 单个文件进度回调(可选)
|
||||||
|
* @returns 哈希值数组
|
||||||
|
*/
|
||||||
|
export async function calculateSHA256Batch(
|
||||||
|
files: Blob[],
|
||||||
|
onFileProgress?: (index: number, percent: number) => void
|
||||||
|
): Promise<string[]> {
|
||||||
|
const results: string[] = [];
|
||||||
|
|
||||||
|
for (let i = 0; i < files.length; i++) {
|
||||||
|
const hash = await calculateSHA256(files[i], (percent) => {
|
||||||
|
onFileProgress?.(i, percent);
|
||||||
|
});
|
||||||
|
results.push(hash);
|
||||||
|
}
|
||||||
|
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 检查文件是否存在(未被修改或删除)
|
||||||
|
* @param fileList 文件列表
|
||||||
|
* @returns 返回第一个不存在的文件,或 null(如果都存在)
|
||||||
|
*/
|
||||||
export function checkIsFilesExist(
|
export function checkIsFilesExist(
|
||||||
fileList: UploadFile[]
|
fileList: Array<{ originFile?: Blob }>
|
||||||
): Promise<UploadFile | null> {
|
): Promise<{ originFile?: Blob } | null> {
|
||||||
return new Promise((resolve) => {
|
return new Promise((resolve) => {
|
||||||
const loadEndFn = (file: UploadFile, reachEnd: boolean, e) => {
|
if (!fileList.length) {
|
||||||
const fileNotExist = !e.target.result;
|
resolve(null);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let checkedCount = 0;
|
||||||
|
const totalCount = fileList.length;
|
||||||
|
|
||||||
|
const loadEndFn = (file: { originFile?: Blob }, e: ProgressEvent<FileReader>) => {
|
||||||
|
checkedCount++;
|
||||||
|
const fileNotExist = !e.target?.result;
|
||||||
if (fileNotExist) {
|
if (fileNotExist) {
|
||||||
resolve(file);
|
resolve(file);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
if (reachEnd) {
|
if (checkedCount >= totalCount) {
|
||||||
resolve(null);
|
resolve(null);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
for (let i = 0; i < fileList.length; i++) {
|
for (const file of fileList) {
|
||||||
const { originFile: file } = fileList[i];
|
|
||||||
const fileReader = new FileReader();
|
const fileReader = new FileReader();
|
||||||
fileReader.readAsArrayBuffer(file);
|
const actualFile = file.originFile;
|
||||||
fileReader.onloadend = (e) =>
|
|
||||||
loadEndFn(fileList[i], i === fileList.length - 1, e);
|
if (!actualFile) {
|
||||||
|
checkedCount++;
|
||||||
|
if (checkedCount >= totalCount) {
|
||||||
|
resolve(null);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
fileReader.readAsArrayBuffer(actualFile.slice(0, 1));
|
||||||
|
fileReader.onloadend = (e) => loadEndFn(file, e);
|
||||||
|
fileReader.onerror = () => {
|
||||||
|
checkedCount++;
|
||||||
|
resolve(file);
|
||||||
|
};
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 判断文件是否为大文件
|
||||||
|
* @param size 文件大小(字节)
|
||||||
|
* @param threshold 阈值(字节),默认 10MB
|
||||||
|
*/
|
||||||
|
export function isLargeFile(size: number, threshold = LARGE_FILE_THRESHOLD): boolean {
|
||||||
|
return size > threshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 格式化文件大小为人类可读格式
|
||||||
|
* @param bytes 字节数
|
||||||
|
* @param decimals 小数位数
|
||||||
|
*/
|
||||||
|
export function formatFileSize(bytes: number, decimals = 2): string {
|
||||||
|
if (bytes === 0) return "0 B";
|
||||||
|
|
||||||
|
const k = 1024;
|
||||||
|
const sizes = ["B", "KB", "MB", "GB", "TB", "PB"];
|
||||||
|
const i = Math.floor(Math.log(bytes) / Math.log(k));
|
||||||
|
|
||||||
|
return `${parseFloat((bytes / Math.pow(k, i)).toFixed(decimals))} ${sizes[i]}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 并发执行异步任务
|
||||||
|
* @param tasks 任务函数数组
|
||||||
|
* @param maxConcurrency 最大并发数
|
||||||
|
* @param onTaskComplete 单个任务完成回调(可选)
|
||||||
|
*/
|
||||||
|
export async function runConcurrentTasks<T>(
|
||||||
|
tasks: (() => Promise<T>)[],
|
||||||
|
maxConcurrency: number,
|
||||||
|
onTaskComplete?: (index: number, result: T) => void
|
||||||
|
): Promise<T[]> {
|
||||||
|
const results: T[] = new Array(tasks.length);
|
||||||
|
let index = 0;
|
||||||
|
|
||||||
|
async function runNext(): Promise<void> {
|
||||||
|
const currentIndex = index++;
|
||||||
|
if (currentIndex >= tasks.length) return;
|
||||||
|
|
||||||
|
const result = await tasks[currentIndex]();
|
||||||
|
results[currentIndex] = result;
|
||||||
|
onTaskComplete?.(currentIndex, result);
|
||||||
|
|
||||||
|
await runNext();
|
||||||
|
}
|
||||||
|
|
||||||
|
const workers = Array(Math.min(maxConcurrency, tasks.length))
|
||||||
|
.fill(null)
|
||||||
|
.map(() => runNext());
|
||||||
|
|
||||||
|
await Promise.all(workers);
|
||||||
|
return results;
|
||||||
|
}
|
||||||
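Editor note: the pool keeps at most maxConcurrency tasks in flight by letting each worker recursively pull the next index. A usage sketch uploading slices three at a time; uploadSlice stands in for the real chunk-upload call.

declare function uploadSlice(slice: Blob, index: number): Promise<void>; // placeholder

async function uploadAllSlices(slices: Blob[]): Promise<void> {
  const tasks = slices.map((slice, i) => () => uploadSlice(slice, i));
  await runConcurrentTasks(tasks, MAX_CONCURRENT_UPLOADS, (i) => {
    console.log(`slice ${i + 1}/${slices.length} uploaded`);
  });
}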
|
|
||||||
|
/**
|
||||||
|
* 按行分割文本文件内容
|
||||||
|
* @param text 文本内容
|
||||||
|
* @param skipEmptyLines 是否跳过空行,默认 true
|
||||||
|
* @returns 行数组
|
||||||
|
*/
|
||||||
|
export function splitTextByLines(text: string, skipEmptyLines = true): string[] {
|
||||||
|
const lines = text.split(/\r?\n/);
|
||||||
|
if (skipEmptyLines) {
|
||||||
|
return lines.filter((line) => line.trim() !== "");
|
||||||
|
}
|
||||||
|
return lines;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 创建分片信息对象
|
||||||
|
* @param file 原始文件
|
||||||
|
* @param chunkSize 分片大小
|
||||||
|
*/
|
||||||
|
export function createFileSliceInfo(
|
||||||
|
file: File | Blob,
|
||||||
|
chunkSize = DEFAULT_CHUNK_SIZE
|
||||||
|
): {
|
||||||
|
originFile: Blob;
|
||||||
|
slices: Blob[];
|
||||||
|
name: string;
|
||||||
|
size: number;
|
||||||
|
totalChunks: number;
|
||||||
|
} {
|
||||||
|
const slices = sliceFile(file, chunkSize);
|
||||||
|
return {
|
||||||
|
originFile: file,
|
||||||
|
slices,
|
||||||
|
name: (file as File).name || "unnamed",
|
||||||
|
size: file.size,
|
||||||
|
totalChunks: slices.length,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 支持的文本文件 MIME 类型前缀
|
||||||
|
*/
|
||||||
|
export const TEXT_FILE_MIME_PREFIX = "text/";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 支持的文本文件 MIME 类型集合
|
||||||
|
*/
|
||||||
|
export const TEXT_FILE_MIME_TYPES = new Set([
|
||||||
|
"application/json",
|
||||||
|
"application/xml",
|
||||||
|
"application/csv",
|
||||||
|
"application/ndjson",
|
||||||
|
"application/x-ndjson",
|
||||||
|
"application/x-yaml",
|
||||||
|
"application/yaml",
|
||||||
|
"application/javascript",
|
||||||
|
"application/x-javascript",
|
||||||
|
"application/sql",
|
||||||
|
"application/rtf",
|
||||||
|
"application/xhtml+xml",
|
||||||
|
"application/svg+xml",
|
||||||
|
]);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 支持的文本文件扩展名集合
|
||||||
|
*/
|
||||||
|
export const TEXT_FILE_EXTENSIONS = new Set([
|
||||||
|
".txt",
|
||||||
|
".md",
|
||||||
|
".markdown",
|
||||||
|
".csv",
|
||||||
|
".tsv",
|
||||||
|
".json",
|
||||||
|
".jsonl",
|
||||||
|
".ndjson",
|
||||||
|
".log",
|
||||||
|
".xml",
|
||||||
|
".yaml",
|
||||||
|
".yml",
|
||||||
|
".sql",
|
||||||
|
".js",
|
||||||
|
".ts",
|
||||||
|
".jsx",
|
||||||
|
".tsx",
|
||||||
|
".html",
|
||||||
|
".htm",
|
||||||
|
".css",
|
||||||
|
".scss",
|
||||||
|
".less",
|
||||||
|
".py",
|
||||||
|
".java",
|
||||||
|
".c",
|
||||||
|
".cpp",
|
||||||
|
".h",
|
||||||
|
".hpp",
|
||||||
|
".go",
|
||||||
|
".rs",
|
||||||
|
".rb",
|
||||||
|
".php",
|
||||||
|
".sh",
|
||||||
|
".bash",
|
||||||
|
".zsh",
|
||||||
|
".ps1",
|
||||||
|
".bat",
|
||||||
|
".cmd",
|
||||||
|
".svg",
|
||||||
|
".rtf",
|
||||||
|
]);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 判断文件是否为文本文件(支持 UploadFile 类型)
|
||||||
|
* @param file UploadFile 对象
|
||||||
|
*/
|
||||||
|
export function isTextUploadFile(file: UploadFile): boolean {
|
||||||
|
const mimeType = (file.type || "").toLowerCase();
|
||||||
|
if (mimeType) {
|
||||||
|
if (mimeType.startsWith(TEXT_FILE_MIME_PREFIX)) return true;
|
||||||
|
if (TEXT_FILE_MIME_TYPES.has(mimeType)) return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
const fileName = file.name || "";
|
||||||
|
const dotIndex = fileName.lastIndexOf(".");
|
||||||
|
if (dotIndex < 0) return false;
|
||||||
|
const ext = fileName.slice(dotIndex).toLowerCase();
|
||||||
|
return TEXT_FILE_EXTENSIONS.has(ext);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 判断文件名是否为文本文件
|
||||||
|
* @param fileName 文件名
|
||||||
|
*/
|
||||||
|
export function isTextFileByName(fileName: string): boolean {
|
||||||
|
const lowerName = fileName.toLowerCase();
|
||||||
|
|
||||||
|
// 先检查 MIME 类型(如果有)
|
||||||
|
// 这里简化处理,主要通过扩展名判断
|
||||||
|
|
||||||
|
const dotIndex = lowerName.lastIndexOf(".");
|
||||||
|
if (dotIndex < 0) return false;
|
||||||
|
const ext = lowerName.slice(dotIndex);
|
||||||
|
return TEXT_FILE_EXTENSIONS.has(ext);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取文件扩展名
|
||||||
|
* @param fileName 文件名
|
||||||
|
*/
|
||||||
|
export function getFileExtension(fileName: string): string {
|
||||||
|
const dotIndex = fileName.lastIndexOf(".");
|
||||||
|
if (dotIndex < 0) return "";
|
||||||
|
return fileName.slice(dotIndex).toLowerCase();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 安全地读取文件为文本
|
||||||
|
* @param file 文件对象
|
||||||
|
* @param encoding 编码,默认 UTF-8
|
||||||
|
*/
|
||||||
|
export function readFileAsText(
|
||||||
|
file: File | Blob,
|
||||||
|
encoding = "UTF-8"
|
||||||
|
): Promise<string> {
|
||||||
|
return new Promise((resolve, reject) => {
|
||||||
|
const reader = new FileReader();
|
||||||
|
reader.onload = (e) => resolve(e.target?.result as string);
|
||||||
|
reader.onerror = () => reject(new Error("Failed to read file"));
|
||||||
|
reader.readAsText(file, encoding);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 流式分割文件并逐行上传
|
||||||
|
* 使用 Blob.slice 逐块读取,避免一次性加载大文件到内存
|
||||||
|
* @param file 文件对象
|
||||||
|
* @param datasetId 数据集ID
|
||||||
|
* @param uploadFn 上传函数,接收 FormData 和配置,返回 Promise
|
||||||
|
* @param onProgress 进度回调 (currentBytes, totalBytes, uploadedLines)
|
||||||
|
* @param chunkSize 每次读取的块大小,默认 1MB
|
||||||
|
* @param options 其他选项
|
||||||
|
* @returns 上传结果统计
|
||||||
|
*/
|
||||||
|
export interface StreamUploadOptions {
|
||||||
|
reqId?: number;
|
||||||
|
resolveReqId?: (params: { totalFileNum: number; totalSize: number }) => Promise<number>;
|
||||||
|
onReqIdResolved?: (reqId: number) => void;
|
||||||
|
fileNamePrefix?: string;
|
||||||
|
hasArchive?: boolean;
|
||||||
|
prefix?: string;
|
||||||
|
signal?: AbortSignal;
|
||||||
|
maxConcurrency?: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface StreamUploadResult {
|
||||||
|
uploadedCount: number;
|
||||||
|
totalBytes: number;
|
||||||
|
skippedEmptyCount: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function processFileLines(
|
||||||
|
file: File,
|
||||||
|
chunkSize: number,
|
||||||
|
signal: AbortSignal | undefined,
|
||||||
|
onLine?: (line: string, index: number) => Promise<void> | void,
|
||||||
|
onProgress?: (currentBytes: number, totalBytes: number, processedLines: number) => void
|
||||||
|
): Promise<{ lineCount: number; skippedEmptyCount: number }> {
|
||||||
|
const fileSize = file.size;
|
||||||
|
let offset = 0;
|
||||||
|
let buffer = "";
|
||||||
|
let skippedEmptyCount = 0;
|
||||||
|
let lineIndex = 0;
|
||||||
|
|
||||||
|
while (offset < fileSize) {
|
||||||
|
if (signal?.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
|
||||||
|
const end = Math.min(offset + chunkSize, fileSize);
|
||||||
|
const chunk = file.slice(offset, end);
|
||||||
|
const text = await readFileAsText(chunk);
|
||||||
|
const combined = buffer + text;
|
||||||
|
const lines = combined.split(/\r?\n/);
|
||||||
|
buffer = lines.pop() || "";
|
||||||
|
|
||||||
|
for (const line of lines) {
|
||||||
|
if (signal?.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
if (!line.trim()) {
|
||||||
|
skippedEmptyCount++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const currentIndex = lineIndex;
|
||||||
|
lineIndex += 1;
|
||||||
|
if (onLine) {
|
||||||
|
await onLine(line, currentIndex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
offset = end;
|
||||||
|
onProgress?.(offset, fileSize, lineIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (buffer.trim()) {
|
||||||
|
const currentIndex = lineIndex;
|
||||||
|
lineIndex += 1;
|
||||||
|
if (onLine) {
|
||||||
|
await onLine(buffer, currentIndex);
|
||||||
|
}
|
||||||
|
} else if (buffer.length > 0) {
|
||||||
|
skippedEmptyCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return { lineCount: lineIndex, skippedEmptyCount };
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function streamSplitAndUpload(
|
||||||
|
file: File,
|
||||||
|
uploadFn: (formData: FormData, config?: { onUploadProgress?: (e: { loaded: number; total: number }) => void }) => Promise<unknown>,
|
||||||
|
onProgress?: (currentBytes: number, totalBytes: number, uploadedLines: number) => void,
|
||||||
|
chunkSize: number = 1024 * 1024, // 1MB
|
||||||
|
options: StreamUploadOptions
|
||||||
|
): Promise<StreamUploadResult> {
|
||||||
|
const {
|
||||||
|
reqId: initialReqId,
|
||||||
|
resolveReqId,
|
||||||
|
onReqIdResolved,
|
||||||
|
fileNamePrefix,
|
||||||
|
prefix,
|
||||||
|
signal,
|
||||||
|
maxConcurrency = 3,
|
||||||
|
} = options;
|
||||||
|
|
||||||
|
const fileSize = file.size;
|
||||||
|
let uploadedCount = 0;
|
||||||
|
let skippedEmptyCount = 0;
|
||||||
|
|
||||||
|
// 获取文件名基础部分和扩展名
|
||||||
|
const originalFileName = fileNamePrefix || file.name;
|
||||||
|
const lastDotIndex = originalFileName.lastIndexOf(".");
|
||||||
|
const baseName = lastDotIndex > 0 ? originalFileName.slice(0, lastDotIndex) : originalFileName;
|
||||||
|
const fileExtension = lastDotIndex > 0 ? originalFileName.slice(lastDotIndex) : "";
|
||||||
|
|
||||||
|
let resolvedReqId = initialReqId;
|
||||||
|
if (!resolvedReqId) {
|
||||||
|
const scanResult = await processFileLines(file, chunkSize, signal);
|
||||||
|
const totalFileNum = scanResult.lineCount;
|
||||||
|
skippedEmptyCount = scanResult.skippedEmptyCount;
|
||||||
|
if (totalFileNum === 0) {
|
||||||
|
return {
|
||||||
|
uploadedCount: 0,
|
||||||
|
totalBytes: fileSize,
|
||||||
|
skippedEmptyCount,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if (signal?.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
if (!resolveReqId) {
|
||||||
|
throw new Error("Missing pre-upload request id");
|
||||||
|
}
|
||||||
|
resolvedReqId = await resolveReqId({ totalFileNum, totalSize: fileSize });
|
||||||
|
if (!resolvedReqId) {
|
||||||
|
throw new Error("Failed to resolve pre-upload request id");
|
||||||
|
}
|
||||||
|
onReqIdResolved?.(resolvedReqId);
|
||||||
|
}
|
||||||
|
if (!resolvedReqId) {
|
||||||
|
throw new Error("Missing pre-upload request id");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 上传单行内容
|
||||||
|
* 每行作为独立文件上传,fileNo 对应行序号,chunkNo 固定为 1
|
||||||
|
*/
|
||||||
|
async function uploadLine(line: string, index: number): Promise<void> {
|
||||||
|
// 检查是否已取消
|
||||||
|
if (signal?.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!line.trim()) {
|
||||||
|
skippedEmptyCount++;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 保留原始文件扩展名
|
||||||
|
const fileIndex = index + 1;
|
||||||
|
const newFileName = `${baseName}_${String(fileIndex).padStart(6, "0")}${fileExtension}`;
|
||||||
|
const blob = new Blob([line], { type: "text/plain" });
|
||||||
|
const lineFile = new File([blob], newFileName, { type: "text/plain" });
|
||||||
|
|
||||||
|
// 计算分片(小文件通常只需要一个分片)
|
||||||
|
const slices = sliceFile(lineFile, DEFAULT_CHUNK_SIZE);
|
||||||
|
const checkSum = await calculateSHA256(slices[0]);
|
||||||
|
|
||||||
|
// 检查是否已取消(计算哈希后)
|
||||||
|
if (signal?.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
|
||||||
|
const formData = new FormData();
|
||||||
|
formData.append("file", slices[0]);
|
||||||
|
formData.append("reqId", resolvedReqId.toString());
|
||||||
|
// 每行作为独立文件上传
|
||||||
|
formData.append("fileNo", fileIndex.toString());
|
||||||
|
formData.append("chunkNo", "1");
|
||||||
|
formData.append("fileName", newFileName);
|
||||||
|
formData.append("fileSize", lineFile.size.toString());
|
||||||
|
formData.append("totalChunkNum", "1");
|
||||||
|
formData.append("checkSumHex", checkSum);
|
||||||
|
if (prefix !== undefined) {
|
||||||
|
formData.append("prefix", prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
await uploadFn(formData, {
|
||||||
|
onUploadProgress: () => {
|
||||||
|
// 单行文件很小,进度主要用于追踪上传状态
|
||||||
|
},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
const inFlight = new Set<Promise<void>>();
|
||||||
|
let uploadError: unknown = null;
|
||||||
|
const enqueueUpload = async (line: string, index: number) => {
|
||||||
|
if (signal?.aborted) {
|
||||||
|
throw new Error("Upload cancelled");
|
||||||
|
}
|
||||||
|
if (uploadError) {
|
||||||
|
throw uploadError;
|
||||||
|
}
|
||||||
|
const uploadPromise = uploadLine(line, index)
|
||||||
|
.then(() => {
|
||||||
|
uploadedCount++;
|
||||||
|
})
|
||||||
|
.catch((err) => {
|
||||||
|
uploadError = err;
|
||||||
|
});
|
||||||
|
inFlight.add(uploadPromise);
|
||||||
|
uploadPromise.finally(() => inFlight.delete(uploadPromise));
|
||||||
|
if (inFlight.size >= maxConcurrency) {
|
||||||
|
await Promise.race(inFlight);
|
||||||
|
if (uploadError) {
|
||||||
|
throw uploadError;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let uploadResult: { lineCount: number; skippedEmptyCount: number } | null = null;
|
||||||
|
try {
|
||||||
|
uploadResult = await processFileLines(
|
||||||
|
file,
|
||||||
|
chunkSize,
|
||||||
|
signal,
|
||||||
|
enqueueUpload,
|
||||||
|
(currentBytes, totalBytes) => {
|
||||||
|
onProgress?.(currentBytes, totalBytes, uploadedCount);
|
||||||
|
}
|
||||||
|
);
|
||||||
|
if (uploadError) {
|
||||||
|
throw uploadError;
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
if (inFlight.size > 0) {
|
||||||
|
await Promise.allSettled(inFlight);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!uploadResult || (initialReqId && uploadResult.lineCount === 0)) {
|
||||||
|
return {
|
||||||
|
uploadedCount: 0,
|
||||||
|
totalBytes: fileSize,
|
||||||
|
skippedEmptyCount: uploadResult?.skippedEmptyCount ?? 0,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!initialReqId) {
|
||||||
|
skippedEmptyCount = skippedEmptyCount || uploadResult.skippedEmptyCount;
|
||||||
|
} else {
|
||||||
|
skippedEmptyCount = uploadResult.skippedEmptyCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
uploadedCount,
|
||||||
|
totalBytes: fileSize,
|
||||||
|
skippedEmptyCount,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 判断文件是否需要流式分割上传
|
||||||
|
* @param file 文件对象
|
||||||
|
* @param threshold 阈值,默认 5MB
|
||||||
|
*/
|
||||||
|
export function shouldStreamUpload(file: File, threshold: number = 5 * 1024 * 1024): boolean {
|
||||||
|
return file.size > threshold;
|
||||||
|
}
|
||||||
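Editor note: an end-to-end sketch of the streaming path — gate on shouldStreamUpload, then stream the file line by line. uploadChunk and preUpload are placeholders for the real API calls, and the assumption is that the pre-upload endpoint resolves to a numeric reqId.

declare function uploadChunk(formData: FormData): Promise<unknown>;                          // placeholder
declare function preUpload(p: { totalFileNum: number; totalSize: number }): Promise<number>; // placeholder

async function importLargeTextFile(file: File, prefix: string, controller: AbortController) {
  if (!shouldStreamUpload(file)) return; // small files keep the pre-sliced path
  const result = await streamSplitAndUpload(
    file,
    uploadChunk,
    (bytes, total, lines) => console.log(`${bytes}/${total} bytes read, ${lines} lines uploaded`),
    1024 * 1024,
    { resolveReqId: preUpload, prefix, signal: controller.signal, maxConcurrency: 3 }
  );
  console.log(`done: ${result.uploadedCount} lines, ${result.skippedEmptyCount} empty lines skipped`);
}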
|
|||||||
@@ -92,6 +92,14 @@ class Request {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 监听 AbortSignal 来中止请求
|
||||||
|
if (config.signal) {
|
||||||
|
config.signal.addEventListener("abort", () => {
|
||||||
|
xhr.abort();
|
||||||
|
reject(new Error("上传已取消"));
|
||||||
|
});
|
||||||
|
}
|
||||||
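Editor note: with the abort listener in place, a caller can cancel an in-flight request through a shared AbortController. Sketch only; the config-with-signal signature of the upload helper is an assumption.

const controller = new AbortController();

async function uploadWithCancel(formData: FormData) {
  try {
    // Passing the signal lets the request layer call xhr.abort() on controller.abort().
    await uploadFileChunkUsingPost(formData, { signal: controller.signal }); // signature assumed
  } catch (err) {
    console.warn("upload aborted or failed:", err);
  }
}

// e.g. wired to a cancel button:
// cancelButton.onclick = () => controller.abort();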
|
|
||||||
// 监听上传进度
|
// 监听上传进度
|
||||||
xhr.upload.addEventListener("progress", function (event) {
|
xhr.upload.addEventListener("progress", function (event) {
|
||||||
if (event.lengthComputable) {
|
if (event.lengthComputable) {
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ class Settings(BaseSettings):
     datamate_backend_base_url: str = "http://datamate-backend:8080/api"
 
     # 标注编辑器(Label Studio Editor)相关
-    editor_max_text_bytes: int = 2 * 1024 * 1024  # 2MB,避免一次加载超大文本卡死前端
+    editor_max_text_bytes: int = 0  # <=0 表示不限制,正数为最大字节数
 
 # 全局设置实例
 settings = Settings()
|
|||||||
@@ -61,13 +61,15 @@ class DatasetFiles(Base):
     dataset_id = Column(String(36), nullable=False, comment="所属数据集ID(UUID)")
     file_name = Column(String(255), nullable=False, comment="文件名")
     file_path = Column(String(1000), nullable=False, comment="文件路径")
+    logical_path = Column(String(1000), nullable=False, comment="文件逻辑路径(相对数据集根目录)")
+    version = Column(BigInteger, nullable=False, default=1, comment="文件版本号(同 logical_path 递增)")
     file_type = Column(String(50), nullable=True, comment="文件格式:JPG/PNG/DCM/TXT等")
     file_size = Column(BigInteger, default=0, comment="文件大小(字节)")
     check_sum = Column(String(64), nullable=True, comment="文件校验和")
     tags = Column(JSON, nullable=True, comment="文件标签信息")
     tags_updated_at = Column(TIMESTAMP, nullable=True, comment="标签最后更新时间")
     dataset_filemetadata = Column("metadata", JSON, nullable=True, comment="文件元数据")
-    status = Column(String(50), default='ACTIVE', comment="文件状态:ACTIVE/DELETED/PROCESSING")
+    status = Column(String(50), default='ACTIVE', comment="文件状态:ACTIVE/ARCHIVED/DELETED/PROCESSING")
     upload_time = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="上传时间")
     last_access_time = Column(TIMESTAMP, nullable=True, comment="最后访问时间")
     created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
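With logical_path plus a per-path version counter, "the current file at a path" becomes a simple ordered query. A sketch only, reusing the model and async-session style seen elsewhere in this patch:

    from sqlalchemy import select
    from sqlalchemy.ext.asyncio import AsyncSession

    from app.db.models import DatasetFiles

    async def get_latest_file(db: AsyncSession, dataset_id: str, logical_path: str):
        # Pick the highest version recorded for this logical path inside the dataset
        result = await db.execute(
            select(DatasetFiles)
            .where(
                DatasetFiles.dataset_id == dataset_id,
                DatasetFiles.logical_path == logical_path,
            )
            .order_by(DatasetFiles.version.desc())
            .limit(1)
        )
        return result.scalar_one_or_none()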
@@ -19,6 +19,7 @@ from app.db.session import get_db
 from app.module.annotation.schema.editor import (
     EditorProjectInfo,
     EditorTaskListResponse,
+    EditorTaskSegmentResponse,
     EditorTaskResponse,
     UpsertAnnotationRequest,
     UpsertAnnotationResponse,
@@ -87,6 +88,21 @@ async def get_editor_task(
     return StandardResponse(code=200, message="success", data=task)


+@router.get(
+    "/projects/{project_id}/tasks/{file_id}/segments",
+    response_model=StandardResponse[EditorTaskSegmentResponse],
+)
+async def get_editor_task_segment(
+    project_id: str = Path(..., description="标注项目ID(t_dm_labeling_projects.id)"),
+    file_id: str = Path(..., description="文件ID(t_dm_dataset_files.id)"),
+    segment_index: int = Query(..., ge=0, alias="segmentIndex", description="段落索引(从0开始)"),
+    db: AsyncSession = Depends(get_db),
+):
+    service = AnnotationEditorService(db)
+    result = await service.get_task_segment(project_id, file_id, segment_index)
+    return StandardResponse(code=200, message="success", data=result)
+
+
 @router.put(
     "/projects/{project_id}/tasks/{file_id}/annotation",
     response_model=StandardResponse[UpsertAnnotationResponse],
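A caller fetches one segment at a time from the new endpoint. A sketch with httpx; the base path prefix and the {code, message, data} envelope are assumptions inferred from the StandardResponse usage above:

    import asyncio
    import httpx

    async def fetch_segment(base_url: str, project_id: str, file_id: str, index: int) -> dict:
        async with httpx.AsyncClient(base_url=base_url) as client:
            resp = await client.get(
                f"/projects/{project_id}/tasks/{file_id}/segments",
                params={"segmentIndex": index},
            )
            resp.raise_for_status()
            return resp.json()["data"]  # assumed envelope: {"code", "message", "data"}

    # Example (hypothetical host and route prefix):
    # asyncio.run(fetch_segment("http://localhost:18000/api/annotation", "proj-1", "file-1", 0))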
@@ -150,6 +150,18 @@ async def create_mapping(
         labeling_project, snapshot_file_ids
     )

+    # If segmentation is enabled and this is a text dataset, pre-generate the segment structure
+    if dataset_type == TEXT_DATASET_TYPE and request.segmentation_enabled:
+        try:
+            from ..service.editor import AnnotationEditorService
+            editor_service = AnnotationEditorService(db)
+            # Precompute the segments (so the editor does not have to do it on first open)
+            segmentation_result = await editor_service.precompute_segmentation_for_project(labeling_project.id)
+            logger.info(f"Precomputed segmentation for project {labeling_project.id}: {segmentation_result}")
+        except Exception as e:
+            logger.warning(f"Failed to precompute segmentation for project {labeling_project.id}: {e}")
+            # Project creation is not affected; only a warning is logged
+
     response_data = DatasetMappingCreateResponse(
         id=mapping.id,
         labeling_project_id=str(mapping.labeling_project_id),
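The precompute call returns plain counters ({"total_files", "succeeded", "failed"}), so callers can report the outcome without inspecting the database. A small standalone sketch of how those counters might be summarized:

    from typing import Any, Dict

    def summarize_precompute(stats: Dict[str, Any]) -> str:
        # stats comes straight from precompute_segmentation_for_project
        if stats.get("failed"):
            return f"segmentation precompute finished with {stats['failed']} failure(s) out of {stats['total_files']}"
        return f"segmentation precompute ok: {stats['succeeded']}/{stats['total_files']} files"

    print(summarize_precompute({"total_files": 3, "succeeded": 3, "failed": 0}))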
@@ -79,12 +79,9 @@ class EditorTaskListResponse(BaseModel):


 class SegmentInfo(BaseModel):
-    """Segment info (for segmented text annotation)"""
+    """Segment summary (for segmented text annotation)"""

     idx: int = Field(..., description="段落索引")
-    text: str = Field(..., description="段落文本")
-    start: int = Field(..., description="在原文中的起始位置")
-    end: int = Field(..., description="在原文中的结束位置")
     has_annotation: bool = Field(False, alias="hasAnnotation", description="该段落是否已有标注")
     line_index: int = Field(0, alias="lineIndex", description="JSONL 行索引(从0开始)")
     chunk_index: int = Field(0, alias="chunkIndex", description="行内分片索引(从0开始)")
@@ -100,7 +97,29 @@ class EditorTaskResponse(BaseModel):

     # Segmentation-related fields
     segmented: bool = Field(False, description="是否启用分段模式")
-    segments: Optional[List[SegmentInfo]] = Field(None, description="段落列表")
+    total_segments: int = Field(0, alias="totalSegments", description="总段落数")
+    current_segment_index: int = Field(0, alias="currentSegmentIndex", description="当前段落索引")
+
+    model_config = ConfigDict(populate_by_name=True)
+
+
+class SegmentDetail(BaseModel):
+    """Segment content"""
+
+    idx: int = Field(..., description="段落索引")
+    text: str = Field(..., description="段落文本")
+    has_annotation: bool = Field(False, alias="hasAnnotation", description="该段落是否已有标注")
+    line_index: int = Field(0, alias="lineIndex", description="JSONL 行索引(从0开始)")
+    chunk_index: int = Field(0, alias="chunkIndex", description="行内分片索引(从0开始)")
+
+    model_config = ConfigDict(populate_by_name=True)
+
+
+class EditorTaskSegmentResponse(BaseModel):
+    """Editor response for a single segment"""
+
+    segmented: bool = Field(False, description="是否启用分段模式")
+    segment: Optional[SegmentDetail] = Field(None, description="段落内容")
     total_segments: int = Field(0, alias="totalSegments", description="总段落数")
     current_segment_index: int = Field(0, alias="currentSegmentIndex", description="当前段落索引")
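With populate_by_name=True and camelCase aliases, the service can build responses with either snake_case field names or the aliases, and serialize with the aliases for the frontend. A self-contained sketch using stand-in models that mirror the fields above (assuming pydantic v2):

    from typing import Optional
    from pydantic import BaseModel, ConfigDict, Field

    class SegmentDetail(BaseModel):
        idx: int
        text: str
        has_annotation: bool = Field(False, alias="hasAnnotation")
        line_index: int = Field(0, alias="lineIndex")
        chunk_index: int = Field(0, alias="chunkIndex")
        model_config = ConfigDict(populate_by_name=True)

    class EditorTaskSegmentResponse(BaseModel):
        segmented: bool = False
        segment: Optional[SegmentDetail] = None
        total_segments: int = Field(0, alias="totalSegments")
        current_segment_index: int = Field(0, alias="currentSegmentIndex")
        model_config = ConfigDict(populate_by_name=True)

    resp = EditorTaskSegmentResponse(
        segmented=True,
        segment=SegmentDetail(idx=0, text="hello", hasAnnotation=False, lineIndex=0, chunkIndex=0),
        totalSegments=5,
        currentSegmentIndex=0,
    )
    print(resp.model_dump(by_alias=True))
    # {'segmented': True, 'segment': {...}, 'totalSegments': 5, 'currentSegmentIndex': 0}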
@@ -36,7 +36,9 @@ from app.module.annotation.schema.editor import (
     EditorProjectInfo,
     EditorTaskListItem,
     EditorTaskListResponse,
+    EditorTaskSegmentResponse,
     EditorTaskResponse,
+    SegmentDetail,
     SegmentInfo,
     UpsertAnnotationRequest,
     UpsertAnnotationResponse,
@@ -538,6 +540,50 @@ class AnnotationEditorService:
                 return value
         return raw_text

+    def _build_segment_contexts(
+        self,
+        records: List[Tuple[Optional[Dict[str, Any]], str]],
+        record_texts: List[str],
+        segment_annotation_keys: set[str],
+    ) -> Tuple[List[SegmentInfo], List[Tuple[Optional[Dict[str, Any]], str, str, int, int]]]:
+        splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD)
+        segments: List[SegmentInfo] = []
+        segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = []
+        segment_cursor = 0
+
+        for record_index, ((payload, raw_text), record_text) in enumerate(zip(records, record_texts)):
+            normalized_text = record_text or ""
+            if len(normalized_text) > self.SEGMENT_THRESHOLD:
+                raw_segments = splitter.split(normalized_text)
+                for chunk_index, seg in enumerate(raw_segments):
+                    segments.append(
+                        SegmentInfo(
+                            idx=segment_cursor,
+                            hasAnnotation=str(segment_cursor) in segment_annotation_keys,
+                            lineIndex=record_index,
+                            chunkIndex=chunk_index,
+                        )
+                    )
+                    segment_contexts.append((payload, raw_text, seg["text"], record_index, chunk_index))
+                    segment_cursor += 1
+            else:
+                segments.append(
+                    SegmentInfo(
+                        idx=segment_cursor,
+                        hasAnnotation=str(segment_cursor) in segment_annotation_keys,
+                        lineIndex=record_index,
+                        chunkIndex=0,
+                    )
+                )
+                segment_contexts.append((payload, raw_text, normalized_text, record_index, 0))
+                segment_cursor += 1
+
+        if not segments:
+            segments = [SegmentInfo(idx=0, hasAnnotation=False, lineIndex=0, chunkIndex=0)]
+            segment_contexts = [(None, "", "", 0, 0)]
+
+        return segments, segment_contexts
+
     async def get_project_info(self, project_id: str) -> EditorProjectInfo:
         project = await self._get_project_or_404(project_id)

@@ -668,6 +714,124 @@ class AnnotationEditorService:

         return await self._build_text_task(project, file_record, file_id, segment_index)

+    async def get_task_segment(
+        self,
+        project_id: str,
+        file_id: str,
+        segment_index: int,
+    ) -> EditorTaskSegmentResponse:
+        project = await self._get_project_or_404(project_id)
+
+        dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
+        if dataset_type != DATASET_TYPE_TEXT:
+            raise HTTPException(
+                status_code=400,
+                detail="当前仅支持 TEXT 项目的段落内容",
+            )
+
+        file_result = await self.db.execute(
+            select(DatasetFiles).where(
+                DatasetFiles.id == file_id,
+                DatasetFiles.dataset_id == project.dataset_id,
+            )
+        )
+        file_record = file_result.scalar_one_or_none()
+        if not file_record:
+            raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")
+
+        if not self._resolve_segmentation_enabled(project):
+            return EditorTaskSegmentResponse(
+                segmented=False,
+                segment=None,
+                totalSegments=0,
+                currentSegmentIndex=0,
+            )
+
+        text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
+        assert isinstance(text_content, str)
+        label_config = await self._resolve_project_label_config(project)
+        primary_text_key = self._resolve_primary_text_key(label_config)
+        file_name = str(getattr(file_record, "file_name", "")).lower()
+
+        records: List[Tuple[Optional[Dict[str, Any]], str]] = []
+        if file_name.endswith(JSONL_EXTENSION):
+            records = self._parse_jsonl_records(text_content)
+        else:
+            parsed_payload = self._try_parse_json_payload(text_content)
+            if parsed_payload:
+                records = [(parsed_payload, text_content)]
+
+        if not records:
+            records = [(None, text_content)]
+
+        record_texts = [
+            self._resolve_primary_text_value(payload, raw_text, primary_text_key)
+            for payload, raw_text in records
+        ]
+        if not record_texts:
+            record_texts = [text_content]
+
+        needs_segmentation = len(records) > 1 or any(
+            len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts
+        )
+        if not needs_segmentation:
+            return EditorTaskSegmentResponse(
+                segmented=False,
+                segment=None,
+                totalSegments=0,
+                currentSegmentIndex=0,
+            )
+
+        ann_result = await self.db.execute(
+            select(AnnotationResult).where(
+                AnnotationResult.project_id == project.id,
+                AnnotationResult.file_id == file_id,
+            )
+        )
+        ann = ann_result.scalar_one_or_none()
+        segment_annotations: Dict[str, Dict[str, Any]] = {}
+        if ann and isinstance(ann.annotation, dict):
+            segment_annotations = self._extract_segment_annotations(ann.annotation)
+        segment_annotation_keys = set(segment_annotations.keys())
+
+        segments, segment_contexts = self._build_segment_contexts(
+            records,
+            record_texts,
+            segment_annotation_keys,
+        )
+
+        total_segments = len(segment_contexts)
+        if total_segments == 0:
+            return EditorTaskSegmentResponse(
+                segmented=False,
+                segment=None,
+                totalSegments=0,
+                currentSegmentIndex=0,
+            )
+
+        if segment_index < 0 or segment_index >= total_segments:
+            raise HTTPException(
+                status_code=400,
+                detail=f"segmentIndex 超出范围: {segment_index}",
+            )
+
+        segment_info = segments[segment_index]
+        _, _, segment_text, line_index, chunk_index = segment_contexts[segment_index]
+        segment_detail = SegmentDetail(
+            idx=segment_info.idx,
+            text=segment_text,
+            hasAnnotation=segment_info.has_annotation,
+            lineIndex=line_index,
+            chunkIndex=chunk_index,
+        )
+
+        return EditorTaskSegmentResponse(
+            segmented=True,
+            segment=segment_detail,
+            totalSegments=total_segments,
+            currentSegmentIndex=segment_index,
+        )
+
     async def _build_text_task(
         self,
         project: LabelingProject,
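The flat segment index is just a running cursor over all records and their chunks, which is what lineIndex and chunkIndex encode. A minimal standalone sketch of that mapping, using a naive fixed-size splitter as a stand-in for AnnotationTextSplitter (whose exact behaviour is not shown in this patch):

    from typing import List, Tuple

    def naive_split(text: str, max_chars: int) -> List[str]:
        # Stand-in splitter: fixed-size character windows
        return [text[i:i + max_chars] for i in range(0, len(text), max_chars)] or [""]

    def index_segments(record_texts: List[str], max_chars: int) -> List[Tuple[int, int, int]]:
        mapping = []  # (segment_idx, line_index, chunk_index)
        cursor = 0
        for line_index, text in enumerate(record_texts):
            chunks = naive_split(text, max_chars) if len(text) > max_chars else [text]
            for chunk_index, _ in enumerate(chunks):
                mapping.append((cursor, line_index, chunk_index))
                cursor += 1
        return mapping

    print(index_segments(["short", "x" * 25], max_chars=10))
    # [(0, 0, 0), (1, 1, 0), (2, 1, 1), (3, 1, 2)]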
@@ -723,7 +887,8 @@ class AnnotationEditorService:
         needs_segmentation = segmentation_enabled and (
             len(records) > 1 or any(len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts)
         )
-        segments: Optional[List[SegmentInfo]] = None
+        segments: List[SegmentInfo] = []
+        segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = []
         current_segment_index = 0
         display_text = record_texts[0] if record_texts else text_content
         selected_payload = records[0][0] if records else None
@@ -732,46 +897,13 @@ class AnnotationEditorService:
             display_text = "\n".join(record_texts) if record_texts else text_content

         if needs_segmentation:
-            splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD)
-            segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = []
-            segments = []
-            segment_cursor = 0
-
-            for record_index, ((payload, raw_text), record_text) in enumerate(zip(records, record_texts)):
-                normalized_text = record_text or ""
-                if len(normalized_text) > self.SEGMENT_THRESHOLD:
-                    raw_segments = splitter.split(normalized_text)
-                    for chunk_index, seg in enumerate(raw_segments):
-                        segments.append(SegmentInfo(
-                            idx=segment_cursor,
-                            text=seg["text"],
-                            start=seg["start"],
-                            end=seg["end"],
-                            hasAnnotation=str(segment_cursor) in segment_annotation_keys,
-                            lineIndex=record_index,
-                            chunkIndex=chunk_index,
-                        ))
-                        segment_contexts.append((payload, raw_text, seg["text"], record_index, chunk_index))
-                        segment_cursor += 1
-                else:
-                    segments.append(SegmentInfo(
-                        idx=segment_cursor,
-                        text=normalized_text,
-                        start=0,
-                        end=len(normalized_text),
-                        hasAnnotation=str(segment_cursor) in segment_annotation_keys,
-                        lineIndex=record_index,
-                        chunkIndex=0,
-                    ))
-                    segment_contexts.append((payload, raw_text, normalized_text, record_index, 0))
-                    segment_cursor += 1
-
-            if not segments:
-                segments = [SegmentInfo(idx=0, text="", start=0, end=0, hasAnnotation=False, lineIndex=0, chunkIndex=0)]
-                segment_contexts = [(None, "", "", 0, 0)]
+            _, segment_contexts = self._build_segment_contexts(
+                records,
+                record_texts,
+                segment_annotation_keys,
+            )

             current_segment_index = segment_index if segment_index is not None else 0
-            if current_segment_index < 0 or current_segment_index >= len(segments):
+            if current_segment_index < 0 or current_segment_index >= len(segment_contexts):
                 current_segment_index = 0

             selected_payload, _, display_text, _, _ = segment_contexts[current_segment_index]
@@ -849,8 +981,7 @@ class AnnotationEditorService:
             task=task,
             annotationUpdatedAt=annotation_updated_at,
             segmented=needs_segmentation,
-            segments=segments,
-            totalSegments=len(segments) if segments else 1,
+            totalSegments=len(segment_contexts) if needs_segmentation else 1,
             currentSegmentIndex=current_segment_index,
         )
@@ -1185,3 +1316,195 @@ class AnnotationEditorService:
         except Exception as exc:
             logger.warning("标注同步知识管理失败:%s", exc)

+    async def precompute_segmentation_for_project(
+        self,
+        project_id: str,
+        max_retries: int = 3
+    ) -> Dict[str, Any]:
+        """
+        Precompute the segment structure for every text file of the given project and persist it to the database.
+
+        Args:
+            project_id: labeling project ID
+            max_retries: number of retries on failure
+
+        Returns:
+            Statistics: {total_files, succeeded, failed}
+        """
+        project = await self._get_project_or_404(project_id)
+        dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
+
+        # Only text datasets are handled
+        if dataset_type != DATASET_TYPE_TEXT:
+            logger.info(f"项目 {project_id} 不是文本数据集,跳过切片预生成")
+            return {"total_files": 0, "succeeded": 0, "failed": 0}
+
+        # Check whether segmentation is enabled
+        if not self._resolve_segmentation_enabled(project):
+            logger.info(f"项目 {project_id} 未启用分段,跳过切片预生成")
+            return {"total_files": 0, "succeeded": 0, "failed": 0}
+
+        # Load all files of the project (source documents are filtered out below)
+        files_result = await self.db.execute(
+            select(DatasetFiles)
+            .join(LabelingProjectFile, LabelingProjectFile.file_id == DatasetFiles.id)
+            .where(
+                LabelingProjectFile.project_id == project_id,
+                DatasetFiles.dataset_id == project.dataset_id,
+            )
+        )
+        file_records = files_result.scalars().all()
+
+        if not file_records:
+            logger.info(f"项目 {project_id} 没有文件,跳过切片预生成")
+            return {"total_files": 0, "succeeded": 0, "failed": 0}
+
+        # Filter out source-document files
+        valid_files = []
+        for file_record in file_records:
+            file_type = str(getattr(file_record, "file_type", "") or "").lower()
+            file_name = str(getattr(file_record, "file_name", "")).lower()
+            is_source_document = (
+                file_type in SOURCE_DOCUMENT_TYPES or
+                any(file_name.endswith(ext) for ext in SOURCE_DOCUMENT_EXTENSIONS)
+            )
+            if not is_source_document:
+                valid_files.append(file_record)
+
+        total_files = len(valid_files)
+        succeeded = 0
+        failed = 0
+
+        label_config = await self._resolve_project_label_config(project)
+        primary_text_key = self._resolve_primary_text_key(label_config)
+
+        for file_record in valid_files:
+            file_id = str(file_record.id)  # type: ignore
+            file_name = str(getattr(file_record, "file_name", ""))
+
+            for retry in range(max_retries):
+                try:
+                    # Read the text content
+                    text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
+                    if not isinstance(text_content, str):
+                        logger.warning(f"文件 {file_id} 内容不是字符串,跳过切片")
+                        failed += 1
+                        break
+
+                    # Parse the text records
+                    records: List[Tuple[Optional[Dict[str, Any]], str]] = []
+                    if file_name.lower().endswith(JSONL_EXTENSION):
+                        records = self._parse_jsonl_records(text_content)
+                    else:
+                        parsed_payload = self._try_parse_json_payload(text_content)
+                        if parsed_payload:
+                            records = [(parsed_payload, text_content)]
+
+                    if not records:
+                        records = [(None, text_content)]
+
+                    record_texts = [
+                        self._resolve_primary_text_value(payload, raw_text, primary_text_key)
+                        for payload, raw_text in records
+                    ]
+                    if not record_texts:
+                        record_texts = [text_content]
+
+                    # Decide whether segmentation is needed
+                    needs_segmentation = len(records) > 1 or any(
+                        len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts
+                    )
+
+                    if not needs_segmentation:
+                        # Skip files that do not need segmentation
+                        succeeded += 1
+                        break
+
+                    # Perform the split
+                    splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD)
+                    segment_cursor = 0
+                    segments = {}
+
+                    for record_index, ((payload, raw_text), record_text) in enumerate(zip(records, record_texts)):
+                        normalized_text = record_text or ""
+
+                        if len(normalized_text) > self.SEGMENT_THRESHOLD:
+                            raw_segments = splitter.split(normalized_text)
+                            for chunk_index, seg in enumerate(raw_segments):
+                                segments[str(segment_cursor)] = {
+                                    SEGMENT_RESULT_KEY: [],
+                                    SEGMENT_CREATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
+                                    SEGMENT_UPDATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
+                                }
+                                segment_cursor += 1
+                        else:
+                            segments[str(segment_cursor)] = {
+                                SEGMENT_RESULT_KEY: [],
+                                SEGMENT_CREATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
+                                SEGMENT_UPDATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
+                            }
+                            segment_cursor += 1
+
+                    if not segments:
+                        succeeded += 1
+                        break
+
+                    # Build the segmented annotation structure
+                    final_payload = {
+                        SEGMENTED_KEY: True,
+                        "version": 1,
+                        SEGMENTS_KEY: segments,
+                        SEGMENT_TOTAL_KEY: segment_cursor,
+                    }
+
+                    # Check whether an annotation already exists
+                    existing_result = await self.db.execute(
+                        select(AnnotationResult).where(
+                            AnnotationResult.project_id == project_id,
+                            AnnotationResult.file_id == file_id,
+                        )
+                    )
+                    existing = existing_result.scalar_one_or_none()
+
+                    now = datetime.utcnow()
+
+                    if existing:
+                        # Update the existing annotation
+                        existing.annotation = final_payload  # type: ignore[assignment]
+                        existing.annotation_status = ANNOTATION_STATUS_IN_PROGRESS  # type: ignore[assignment]
+                        existing.updated_at = now  # type: ignore[assignment]
+                    else:
+                        # Create a new annotation record
+                        record = AnnotationResult(
+                            id=str(uuid.uuid4()),
+                            project_id=project_id,
+                            file_id=file_id,
+                            annotation=final_payload,
+                            annotation_status=ANNOTATION_STATUS_IN_PROGRESS,
+                            created_at=now,
+                            updated_at=now,
+                        )
+                        self.db.add(record)
+
+                    await self.db.commit()
+                    succeeded += 1
+                    logger.info(f"成功为文件 {file_id} 预生成 {segment_cursor} 个切片")
+                    break
+
+                except Exception as e:
+                    logger.warning(
+                        f"为文件 {file_id} 预生成切片失败 (重试 {retry + 1}/{max_retries}): {e}"
+                    )
+                    if retry == max_retries - 1:
+                        failed += 1
+                    await self.db.rollback()
+
+        logger.info(
+            f"项目 {project_id} 切片预生成完成: 总计 {total_files}, 成功 {succeeded}, 失败 {failed}"
+        )
+        return {
+            "total_files": total_files,
+            "succeeded": succeeded,
+            "failed": failed,
+        }
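The precomputed payload stored in AnnotationResult.annotation is a plain dict keyed by SEGMENTED_KEY, SEGMENTS_KEY, SEGMENT_TOTAL_KEY and the per-segment *_RESULT/_CREATED_AT/_UPDATED_AT keys; those constants are defined elsewhere in the service and are not visible in this patch. An illustration only, with assumed literal key names:

    from datetime import datetime, timezone

    now = datetime.now(timezone.utc).isoformat()
    precomputed = {
        "segmented": True,            # SEGMENTED_KEY (assumed literal)
        "version": 1,
        "segments": {                 # SEGMENTS_KEY (assumed literal)
            "0": {"result": [], "created_at": now, "updated_at": now},
            "1": {"result": [], "created_at": now, "updated_at": now},
        },
        "total_segments": 2,          # SEGMENT_TOTAL_KEY (assumed literal)
    }
    print(list(precomputed["segments"].keys()))  # segment indices as string keys: ['0', '1']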
@@ -11,7 +11,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.core.config import settings
 from app.core.logging import get_logger
 from app.db.models import Dataset, DatasetFiles, LabelingProject
-from app.module.annotation.service.text_fetcher import fetch_text_content_via_download_api

 logger = get_logger(__name__)

@@ -77,15 +76,18 @@ class KnowledgeSyncService:

         if set_id:
             exists = await self._get_knowledge_set(set_id)
-            if exists:
+            if exists and self._metadata_matches_project(exists.get("metadata"), project.id):
                 return set_id
-            logger.warning("知识集不存在,准备重建:set_id=%s", set_id)
+            logger.warning(
+                "知识集不存在或归属不匹配,准备重建:set_id=%s project_id=%s",
+                set_id,
+                project.id,
+            )

-        dataset_name = project.name or "annotation-project"
-        base_name = dataset_name.strip() or "annotation-project"
+        project_name = (project.name or "annotation-project").strip() or "annotation-project"
         metadata = self._build_set_metadata(project)

-        existing = await self._find_knowledge_set_by_name(base_name)
+        existing = await self._find_knowledge_set_by_name_and_project(project_name, project.id)
         if existing:
             await self._update_project_config(
                 project,
@@ -96,19 +98,19 @@ class KnowledgeSyncService:
             )
             return existing.get("id")

-        created = await self._create_knowledge_set(base_name, metadata)
+        created = await self._create_knowledge_set(project_name, metadata)
         if not created:
-            created = await self._find_knowledge_set_by_name(base_name)
+            created = await self._find_knowledge_set_by_name_and_project(project_name, project.id)

         if not created:
-            fallback_name = self._build_fallback_set_name(base_name, project.id)
-            existing = await self._find_knowledge_set_by_name(fallback_name)
+            fallback_name = self._build_fallback_set_name(project_name, project.id)
+            existing = await self._find_knowledge_set_by_name_and_project(fallback_name, project.id)
             if existing:
                 created = existing
             else:
                 created = await self._create_knowledge_set(fallback_name, metadata)
                 if not created:
-                    created = await self._find_knowledge_set_by_name(fallback_name)
+                    created = await self._find_knowledge_set_by_name_and_project(fallback_name, project.id)

         if not created:
             return None
@@ -153,16 +155,18 @@ class KnowledgeSyncService:
             return []
         return [item for item in content if isinstance(item, dict)]

-    async def _find_knowledge_set_by_name(self, name: str) -> Optional[Dict[str, Any]]:
+    async def _find_knowledge_set_by_name_and_project(self, name: str, project_id: str) -> Optional[Dict[str, Any]]:
         if not name:
             return None
         items = await self._list_knowledge_sets(name)
         if not items:
             return None
-        exact_matches = [item for item in items if item.get("name") == name]
-        if not exact_matches:
-            return None
-        return exact_matches[0]
+        for item in items:
+            if item.get("name") != name:
+                continue
+            if self._metadata_matches_project(item.get("metadata"), project_id):
+                return item
+        return None

     async def _create_knowledge_set(self, name: str, metadata: str) -> Optional[Dict[str, Any]]:
         payload = {
@@ -249,16 +253,6 @@ class KnowledgeSyncService:
             content_type = "MARKDOWN"

         content = annotation_json
-        if dataset_type == "TEXT":
-            try:
-                content = await fetch_text_content_via_download_api(
-                    project.dataset_id,
-                    str(file_record.id),
-                )
-                content = self._append_annotation_to_content(content, annotation_json, content_type)
-            except Exception as exc:
-                logger.warning("读取文本失败,改为仅存标注JSON:%s", exc)
-                content = annotation_json

         payload: Dict[str, Any] = {
             "title": title,
@@ -289,13 +283,6 @@ class KnowledgeSyncService:
             extension = file_type
         return extension.lower() in {"md", "markdown"}

-    def _append_annotation_to_content(self, content: str, annotation_json: str, content_type: str) -> str:
-        if content_type == "MARKDOWN":
-            return (
-                f"{content}\n\n---\n\n## 标注结果\n\n```json\n"
-                f"{annotation_json}\n```")
-        return f"{content}\n\n---\n\n标注结果(JSON):\n{annotation_json}"
-
     def _strip_extension(self, file_name: str) -> str:
         if not file_name:
             return ""
@@ -359,6 +346,27 @@ class KnowledgeSyncService:
         except Exception:
             return json.dumps({"error": "failed to serialize"}, ensure_ascii=False)

+    def _metadata_matches_project(self, metadata: Any, project_id: str) -> bool:
+        if not project_id:
+            return False
+        parsed = self._parse_metadata(metadata)
+        if not parsed:
+            return False
+        return str(parsed.get("project_id") or "").strip() == project_id
+
+    def _parse_metadata(self, metadata: Any) -> Optional[Dict[str, Any]]:
+        if metadata is None:
+            return None
+        if isinstance(metadata, dict):
+            return metadata
+        if isinstance(metadata, str):
+            try:
+                payload = json.loads(metadata)
+            except Exception:
+                return None
+            return payload if isinstance(payload, dict) else None
+        return None
+
     def _safe_response_text(self, response: httpx.Response) -> str:
         try:
             return response.text
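The ownership check accepts the knowledge-set metadata either as a dict or as a JSON string and compares its project_id field against the current project. A standalone sketch of the same rule, useful for seeing which inputs pass:

    import json
    from typing import Any

    def metadata_matches_project(metadata: Any, project_id: str) -> bool:
        if not project_id:
            return False
        if isinstance(metadata, str):
            try:
                metadata = json.loads(metadata)
            except Exception:
                return False
        if not isinstance(metadata, dict):
            return False
        return str(metadata.get("project_id") or "").strip() == project_id

    print(metadata_matches_project('{"project_id": "p-1"}', "p-1"))  # True
    print(metadata_matches_project({"project_id": "p-2"}, "p-1"))    # False
    print(metadata_matches_project("not-json", "p-1"))               # False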
@@ -19,23 +19,24 @@ async def fetch_text_content_via_download_api(dataset_id: str, file_id: str) ->
         resp = await client.get(url)
         resp.raise_for_status()

+        max_bytes = settings.editor_max_text_bytes
         content_length = resp.headers.get("content-length")
-        if content_length:
+        if max_bytes > 0 and content_length:
             try:
-                if int(content_length) > settings.editor_max_text_bytes:
+                if int(content_length) > max_bytes:
                     raise HTTPException(
                         status_code=413,
-                        detail=f"文本文件过大,限制 {settings.editor_max_text_bytes} 字节",
+                        detail=f"文本文件过大,限制 {max_bytes} 字节",
                     )
             except ValueError:
                 # Ignore an invalid content-length and fall back to checking the actual length
                 pass

         data = resp.content
-        if len(data) > settings.editor_max_text_bytes:
+        if max_bytes > 0 and len(data) > max_bytes:
             raise HTTPException(
                 status_code=413,
-                detail=f"文本文件过大,限制 {settings.editor_max_text_bytes} 字节",
+                detail=f"文本文件过大,限制 {max_bytes} 字节",
             )

         # TEXT POC: decode as UTF-8 by default; undecodable characters are replaced
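The size guard now treats a non-positive editor_max_text_bytes as "no limit". A small sketch of the same predicate in isolation:

    def exceeds_limit(payload: bytes, max_bytes: int) -> bool:
        # max_bytes <= 0 disables the check entirely
        return max_bytes > 0 and len(payload) > max_bytes

    print(exceeds_limit(b"x" * 10, 0))    # False: limit disabled
    print(exceeds_limit(b"x" * 10, 5))    # True: over the cap
    print(exceeds_limit(b"x" * 10, 100))  # False: under the cap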
@@ -375,9 +375,9 @@ def _register_output_dataset(
     insert_file_sql = text(
         """
         INSERT INTO t_dm_dataset_files (
-            id, dataset_id, file_name, file_path, file_type, file_size, status
+            id, dataset_id, file_name, file_path, logical_path, version, file_type, file_size, status
         ) VALUES (
-            :id, :dataset_id, :file_name, :file_path, :file_type, :file_size, :status
+            :id, :dataset_id, :file_name, :file_path, :logical_path, :version, :file_type, :file_size, :status
         )
         """
     )
@@ -395,6 +395,7 @@ def _register_output_dataset(

     for file_name, file_path, file_size in image_files:
         ext = os.path.splitext(file_name)[1].lstrip(".").upper() or None
+        logical_path = os.path.relpath(file_path, output_dir).replace("\\", "/")
         conn.execute(
             insert_file_sql,
             {
@@ -402,6 +403,8 @@ def _register_output_dataset(
                 "dataset_id": output_dataset_id,
                 "file_name": file_name,
                 "file_path": file_path,
+                "logical_path": logical_path,
+                "version": 1,
                 "file_type": ext,
                 "file_size": int(file_size),
                 "status": "ACTIVE",
@@ -411,6 +414,7 @@ def _register_output_dataset(

     for file_name, file_path, file_size in annotation_files:
         ext = os.path.splitext(file_name)[1].lstrip(".").upper() or None
+        logical_path = os.path.relpath(file_path, output_dir).replace("\\", "/")
         conn.execute(
             insert_file_sql,
             {
@@ -418,6 +422,8 @@ def _register_output_dataset(
                 "dataset_id": output_dataset_id,
                 "file_name": file_name,
                 "file_path": file_path,
+                "logical_path": logical_path,
+                "version": 1,
                 "file_type": ext,
                 "file_size": int(file_size),
                 "status": "ACTIVE",
@@ -1,9 +1,9 @@
 {
     "query_sql": "SELECT * FROM t_task_instance_info WHERE instance_id IN (:instance_id)",
     "insert_sql": "INSERT INTO t_task_instance_info (instance_id, meta_file_name, meta_file_type, meta_file_id, meta_file_size, file_id, file_size, file_type, file_name, file_path, status, operator_id, error_code, incremental, child_id, slice_num) VALUES (:instance_id, :meta_file_name, :meta_file_type, :meta_file_id, :meta_file_size, :file_id, :file_size, :file_type, :file_name, :file_path, :status, :operator_id, :error_code, :incremental, :child_id, :slice_num)",
-    "insert_dataset_file_sql": "INSERT INTO t_dm_dataset_files (id, dataset_id, file_name, file_path, file_type, file_size, status, upload_time, last_access_time, created_at, updated_at) VALUES (:id, :dataset_id, :file_name, :file_path, :file_type, :file_size, :status, :upload_time, :last_access_time, :created_at, :updated_at)",
+    "insert_dataset_file_sql": "INSERT INTO t_dm_dataset_files (id, dataset_id, file_name, file_path, logical_path, version, file_type, file_size, status, upload_time, last_access_time, created_at, updated_at) VALUES (:id, :dataset_id, :file_name, :file_path, :logical_path, :version, :file_type, :file_size, :status, :upload_time, :last_access_time, :created_at, :updated_at)",
     "insert_clean_result_sql": "INSERT INTO t_clean_result (instance_id, src_file_id, dest_file_id, src_name, dest_name, src_type, dest_type, src_size, dest_size, status, result) VALUES (:instance_id, :src_file_id, :dest_file_id, :src_name, :dest_name, :src_type, :dest_type, :src_size, :dest_size, :status, :result)",
-    "query_dataset_sql": "SELECT file_size FROM t_dm_dataset_files WHERE dataset_id = :dataset_id",
+    "query_dataset_sql": "SELECT file_size FROM t_dm_dataset_files WHERE dataset_id = :dataset_id AND (status IS NULL OR status <> 'ARCHIVED')",
     "update_dataset_sql": "UPDATE t_dm_datasets SET size_bytes = :total_size, file_count = :file_count WHERE id = :dataset_id;",
     "update_task_sql": "UPDATE t_clean_task SET status = :status, after_size = :total_size, finished_at = :finished_time WHERE id = :task_id",
     "create_tables_sql": "CREATE TABLE IF NOT EXISTS t_task_instance_info (instance_id VARCHAR(255), meta_file_name TEXT, meta_file_type VARCHAR(100), meta_file_id BIGINT, meta_file_size VARCHAR(100), file_id BIGINT, file_size VARCHAR(100), file_type VARCHAR(100), file_name TEXT, file_path TEXT, status INT, operator_id VARCHAR(255), error_code VARCHAR(100), incremental VARCHAR(50), child_id BIGINT, slice_num INT DEFAULT 0);",
@@ -54,19 +54,22 @@ CREATE TABLE IF NOT EXISTS t_dm_dataset_files (
     dataset_id VARCHAR(36) NOT NULL COMMENT '所属数据集ID(UUID)',
     file_name VARCHAR(255) NOT NULL COMMENT '文件名',
     file_path VARCHAR(1000) NOT NULL COMMENT '文件路径',
+    logical_path VARCHAR(1000) NOT NULL COMMENT '文件逻辑路径(相对数据集根目录)',
+    version BIGINT NOT NULL DEFAULT 1 COMMENT '文件版本号(同 logical_path 递增)',
     file_type VARCHAR(50) COMMENT '文件格式:JPG/PNG/DCM/TXT等',
     file_size BIGINT DEFAULT 0 COMMENT '文件大小(字节)',
     check_sum VARCHAR(64) COMMENT '文件校验和',
     tags JSON COMMENT '文件标签信息',
     tags_updated_at TIMESTAMP NULL COMMENT '标签最后更新时间',
     metadata JSON COMMENT '文件元数据',
-    status VARCHAR(50) DEFAULT 'ACTIVE' COMMENT '文件状态:ACTIVE/DELETED/PROCESSING',
+    status VARCHAR(50) DEFAULT 'ACTIVE' COMMENT '文件状态:ACTIVE/ARCHIVED/DELETED/PROCESSING',
     upload_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '上传时间',
     last_access_time TIMESTAMP NULL COMMENT '最后访问时间',
     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
     updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
     FOREIGN KEY (dataset_id) REFERENCES t_dm_datasets(id) ON DELETE CASCADE,
     INDEX idx_dm_dataset (dataset_id),
+    INDEX idx_dm_dataset_logical_path (dataset_id, logical_path, version),
     INDEX idx_dm_file_type (file_type),
     INDEX idx_dm_file_status (status),
     INDEX idx_dm_upload_time (upload_time)
@@ -45,7 +45,7 @@ RUN npm config set registry https://registry.npmmirror.com && \

 ##### RUNNER

-FROM gcr.io/distroless/nodejs20-debian12 AS runner
+FROM gcr.nju.edu.cn/distroless/nodejs20-debian12 AS runner
 WORKDIR /app

 ENV NODE_ENV=production
93
scripts/offline/Dockerfile.backend-python.offline
Normal file
@@ -0,0 +1,93 @@
# Offline variant of the backend-python Dockerfile
# Change: uses a local DataX source tree instead of git clone

FROM maven:3-eclipse-temurin-8 AS datax-builder

# Configure the Aliyun Maven mirror
RUN mkdir -p /root/.m2 && \
    echo '<?xml version="1.0" encoding="UTF-8"?>\n\
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"\n\
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n\
          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">\n\
  <mirrors>\n\
    <mirror>\n\
      <id>aliyunmaven</id>\n\
      <mirrorOf>*</mirrorOf>\n\
      <name>阿里云公共仓库</name>\n\
      <url>https://maven.aliyun.com/repository/public</url>\n\
    </mirror>\n\
  </mirrors>\n\
</settings>' > /root/.m2/settings.xml

# Offline mode: take the local DataX path from a build argument
ARG DATAX_LOCAL_PATH=./build-cache/resources/DataX

# Copy the local DataX sources (downloaded beforehand in a connected environment)
COPY ${DATAX_LOCAL_PATH} /DataX

COPY runtime/datax/ DataX/

RUN cd DataX && \
    sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
        plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java && \
    mvn -U clean package assembly:assembly -Dmaven.test.skip=true

FROM python:3.12-slim

# Configure the Aliyun apt mirror
RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
    elif [ -f /etc/apt/sources.list ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
    fi && \
    apt-get update && \
    apt-get install -y --no-install-recommends vim openjdk-21-jre nfs-common glusterfs-client rsync && \
    rm -rf /var/lib/apt/lists/*

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    POETRY_VERSION=2.2.1 \
    POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_CREATE=false \
    POETRY_CACHE_DIR=/tmp/poetry_cache

ENV JAVA_HOME=/usr/lib/jvm/java-21-openjdk

ENV PATH="/root/.local/bin:$JAVA_HOME/bin:$PATH"

WORKDIR /app

# Configure the Aliyun pip mirror and install Poetry
RUN --mount=type=cache,target=/root/.cache/pip \
    pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ && \
    pip config set global.trusted-host mirrors.aliyun.com && \
    pip install --upgrade --root-user-action=ignore pip \
    && pip install --root-user-action=ignore pipx \
    && pipx install "poetry==$POETRY_VERSION"

COPY --from=datax-builder /DataX/target/datax/datax /opt/datax
RUN cp /opt/datax/plugin/reader/mysqlreader/libs/mysql* /opt/datax/plugin/reader/starrocksreader/libs/

# Copy only dependency files first
COPY runtime/datamate-python/pyproject.toml runtime/datamate-python/poetry.lock* /app/

# Install dependencies
RUN --mount=type=cache,target=$POETRY_CACHE_DIR \
    poetry install --no-root --only main

# Offline mode: use local NLTK data
ARG NLTK_DATA_LOCAL_PATH=./build-cache/resources/nltk_data
COPY ${NLTK_DATA_LOCAL_PATH} /usr/local/nltk_data

ENV NLTK_DATA=/usr/local/nltk_data

# Copy the rest of the application
COPY runtime/datamate-python /app

COPY runtime/datamate-python/deploy/docker-entrypoint.sh /docker-entrypoint.sh
RUN chmod +x /docker-entrypoint.sh || true

# Expose the application port
EXPOSE 18000

ENTRYPOINT ["/docker-entrypoint.sh"]
82
scripts/offline/Dockerfile.backend-python.offline-v2
Normal file
@@ -0,0 +1,82 @@
# Offline variant of the backend-python Dockerfile, v2
FROM maven:3-eclipse-temurin-8 AS datax-builder

# Configure the Aliyun Maven mirror
RUN mkdir -p /root/.m2 && \
    echo '<?xml version="1.0" encoding="UTF-8"?>\n\
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"\n\
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n\
          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">\n\
  <mirrors>\n\
    <mirror>\n\
      <id>aliyunmaven</id>\n\
      <mirrorOf>*</mirrorOf>\n\
      <name>阿里云公共仓库</name>\n\
      <url>https://maven.aliyun.com/repository/public</url>\n\
    </mirror>\n\
  </mirrors>\n\
</settings>' > /root/.m2/settings.xml

# Offline mode: take the local DataX path from build arguments
ARG RESOURCES_DIR=./build-cache/resources
ARG DATAX_LOCAL_PATH=${RESOURCES_DIR}/DataX

# Copy the local DataX sources
COPY ${DATAX_LOCAL_PATH} /DataX

COPY runtime/datax/ DataX/

RUN cd DataX && \
    sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
        plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java && \
    mvn -U clean package assembly:assembly -Dmaven.test.skip=true

# Use the base image with APT packages pre-installed
FROM datamate-python-base:latest

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    POETRY_VERSION=2.2.1 \
    POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_CREATE=false \
    POETRY_CACHE_DIR=/tmp/poetry_cache

ENV JAVA_HOME=/usr/lib/jvm/java-21-openjdk
ENV PATH="/root/.local/bin:$JAVA_HOME/bin:$PATH"

WORKDIR /app

# Configure the Aliyun pip mirror and install Poetry
RUN --mount=type=cache,target=/root/.cache/pip \
    pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ && \
    pip config set global.trusted-host mirrors.aliyun.com && \
    pip install --upgrade --root-user-action=ignore pip \
    && pip install --root-user-action=ignore pipx \
    && pipx install "poetry==$POETRY_VERSION"

COPY --from=datax-builder /DataX/target/datax/datax /opt/datax
RUN cp /opt/datax/plugin/reader/mysqlreader/libs/mysql* /opt/datax/plugin/reader/starrocksreader/libs/

# Copy only dependency files first
COPY runtime/datamate-python/pyproject.toml runtime/datamate-python/poetry.lock* /app/

# Install dependencies
RUN --mount=type=cache,target=$POETRY_CACHE_DIR \
    poetry install --no-root --only main

# Offline mode: use local NLTK data
ARG RESOURCES_DIR=./build-cache/resources
ARG NLTK_DATA_LOCAL_PATH=${RESOURCES_DIR}/nltk_data
COPY ${NLTK_DATA_LOCAL_PATH} /usr/local/nltk_data

ENV NLTK_DATA=/usr/local/nltk_data

# Copy the rest of the application
COPY runtime/datamate-python /app

COPY runtime/datamate-python/deploy/docker-entrypoint.sh /docker-entrypoint.sh
RUN chmod +x /docker-entrypoint.sh || true

EXPOSE 18000

ENTRYPOINT ["/docker-entrypoint.sh"]
71
scripts/offline/Dockerfile.backend.offline
Normal file
@@ -0,0 +1,71 @@
# Offline variant of the backend Dockerfile
# Uses a base image with APT packages pre-installed

FROM maven:3-eclipse-temurin-21 AS builder

# Configure the Aliyun Maven mirror
RUN mkdir -p /root/.m2 && \
    echo '<?xml version="1.0" encoding="UTF-8"?>\n\
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"\n\
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n\
          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">\n\
  <mirrors>\n\
    <mirror>\n\
      <id>aliyunmaven</id>\n\
      <mirrorOf>*</mirrorOf>\n\
      <name>阿里云公共仓库</name>\n\
      <url>https://maven.aliyun.com/repository/public</url>\n\
    </mirror>\n\
  </mirrors>\n\
</settings>' > /root/.m2/settings.xml

WORKDIR /opt/backend

# Copy all pom.xml files first
COPY backend/pom.xml ./
COPY backend/services/pom.xml ./services/
COPY backend/shared/domain-common/pom.xml ./shared/domain-common/
COPY backend/shared/security-common/pom.xml ./shared/security-common/
COPY backend/services/data-annotation-service/pom.xml ./services/data-annotation-service/
COPY backend/services/data-cleaning-service/pom.xml ./services/data-cleaning-service/
COPY backend/services/data-evaluation-service/pom.xml ./services/data-evaluation-service/
COPY backend/services/data-management-service/pom.xml ./services/data-management-service/
COPY backend/services/data-synthesis-service/pom.xml ./services/data-synthesis-service/
COPY backend/services/execution-engine-service/pom.xml ./services/execution-engine-service/
COPY backend/services/main-application/pom.xml ./services/main-application/
COPY backend/services/operator-market-service/pom.xml ./services/operator-market-service/
COPY backend/services/pipeline-orchestration-service/pom.xml ./services/pipeline-orchestration-service/
COPY backend/services/rag-indexer-service/pom.xml ./services/rag-indexer-service/
COPY backend/services/rag-query-service/pom.xml ./services/rag-query-service/

# Download dependencies into the cache mount
RUN --mount=type=cache,target=/root/.m2/repository \
    cd /opt/backend/services && \
    mvn dependency:go-offline -Dmaven.test.skip=true || true

# Copy all sources
COPY backend/ /opt/backend

# Build and package
RUN --mount=type=cache,target=/root/.m2/repository \
    cd /opt/backend/services && \
    mvn clean package -Dmaven.test.skip=true

# Use the base image with APT packages pre-installed
FROM datamate-java-base:latest

# No apt-get update here: the base image already has every required package pre-installed
# Extra packages could be added below, but installing them would fail in an offline environment

COPY --from=builder /opt/backend/services/main-application/target/datamate.jar /opt/backend/datamate.jar
COPY scripts/images/backend/start.sh /opt/backend/start.sh
COPY runtime/ops/examples/test_operator/test_operator.tar /opt/backend/test_operator.tar

RUN dos2unix /opt/backend/start.sh \
    && chmod +x /opt/backend/start.sh \
    && ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

EXPOSE 8080

ENTRYPOINT ["/opt/backend/start.sh"]
CMD ["java", "-Duser.timezone=Asia/Shanghai", "-jar", "/opt/backend/datamate.jar"]
62  scripts/offline/Dockerfile.base-images  Normal file
@@ -0,0 +1,62 @@
# Base images with APT packages pre-installed
# Build these images in a networked environment and use them as base images offline

# ==================== backend / gateway base image ====================
FROM eclipse-temurin:21-jdk AS datamate-java-base

# Configure the Aliyun apt mirror
RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
    elif [ -f /etc/apt/sources.list.d/ubuntu.sources ]; then \
        sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g; s/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list.d/ubuntu.sources; \
    elif [ -f /etc/apt/sources.list ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g; s/archive.ubuntu.com/mirrors.aliyun.com/g; s/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list; \
    fi && \
    apt-get update && \
    apt-get install -y vim wget curl rsync python3 python3-pip python-is-python3 dos2unix libreoffice fonts-noto-cjk && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# ==================== backend-python base image ====================
FROM python:3.12-slim AS datamate-python-base

RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
    elif [ -f /etc/apt/sources.list ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
    fi && \
    apt-get update && \
    apt-get install -y --no-install-recommends vim openjdk-21-jre nfs-common glusterfs-client rsync && \
    rm -rf /var/lib/apt/lists/*

# ==================== runtime base image ====================
FROM ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm AS datamate-runtime-base

RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
    elif [ -f /etc/apt/sources.list ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
    fi && \
    apt update && \
    apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix swig poppler-utils tesseract-ocr && \
    rm -rf /var/lib/apt/lists/*

# ==================== deer-flow-backend base image ====================
FROM ghcr.nju.edu.cn/astral-sh/uv:python3.12-bookworm AS deer-flow-backend-base

RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
    elif [ -f /etc/apt/sources.list ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
    fi && \
    apt-get update && apt-get install -y libpq-dev git && \
    rm -rf /var/lib/apt/lists/*

# ==================== mineru base image ====================
FROM python:3.11-slim AS mineru-base

RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources && \
    apt-get update && \
    apt-get install -y curl vim libgl1 libglx0 libopengl0 libglib2.0-0 procps && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
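As a quick sanity check after building these stages, each alias can be built with `--target` and probed for the packages the offline Dockerfiles depend on. A hedged sketch (the stage name matches the alias above; the package probe is illustrative):

```bash
# Build only the Java base stage from the multi-stage file above.
docker build \
  --target datamate-java-base \
  -t datamate-java-base:latest \
  -f scripts/offline/Dockerfile.base-images \
  .

# Confirm the packages the offline Dockerfiles rely on are already baked in.
docker run --rm datamate-java-base:latest \
  bash -c 'dpkg -s dos2unix libreoffice > /dev/null && echo "APT packages present"'
```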
44  scripts/offline/Dockerfile.deer-flow-backend.offline  Normal file
@@ -0,0 +1,44 @@
# Offline version of the deer-flow-backend Dockerfile
# Change: use the local deer-flow sources instead of git clone

FROM ghcr.nju.edu.cn/astral-sh/uv:python3.12-bookworm

# Install uv.
COPY --from=ghcr.nju.edu.cn/astral-sh/uv:latest /uv /bin/uv

# Configure the Aliyun apt mirror and install system dependencies
RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
    elif [ -f /etc/apt/sources.list ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
    fi && \
    apt-get update && apt-get install -y \
        libpq-dev git \
    && rm -rf /var/lib/apt/lists/*

# Point uv at the Aliyun PyPI mirror
ENV UV_INDEX_URL="https://mirrors.aliyun.com/pypi/simple/"

WORKDIR /app

# Offline mode: local deer-flow path
ARG RESOURCES_DIR=./build-cache/resources
ARG DEERFLOW_DIR=${RESOURCES_DIR}/deer-flow

# Copy the local deer-flow sources (downloaded beforehand for the offline environment)
COPY ${DEERFLOW_DIR} /app
COPY runtime/deer-flow/.env /app/.env
COPY runtime/deer-flow/conf.yaml /app/conf.yaml

# Pre-cache the application dependencies.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --locked --no-install-project

# Install the application dependencies.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --locked

EXPOSE 8000

# Run the application.
CMD ["uv", "run", "--no-sync", "python", "server.py", "--host", "0.0.0.0", "--port", "8000"]
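The `COPY ${DEERFLOW_DIR} /app` step assumes the deer-flow sources already sit inside the build context. A hedged sketch of preparing that directory in the networked environment (the clone URL is illustrative; use whichever remote the original online Dockerfile clones):

```bash
# Run where network access is available, from the repository root.
mkdir -p build-cache/resources

# Illustrative remote; substitute the deer-flow repository the project actually uses.
git clone --depth 1 https://github.com/bytedance/deer-flow.git \
  build-cache/resources/deer-flow

# The cloned tree then travels to the offline host inside build-cache-*.tar.gz.
```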
75  scripts/offline/Dockerfile.deer-flow-frontend.offline  Normal file
@@ -0,0 +1,75 @@
# Offline version of the deer-flow-frontend Dockerfile
# Change: use the local deer-flow sources instead of git clone

##### DEPENDENCIES

FROM node:20-alpine AS deps
RUN apk add --no-cache libc6-compat openssl
WORKDIR /app

# Offline mode: local deer-flow path
ARG RESOURCES_DIR=./build-cache/resources
ARG DEERFLOW_DIR=${RESOURCES_DIR}/deer-flow

# Copy the local deer-flow sources
COPY ${DEERFLOW_DIR}/web /app

# Configure the npmmirror registry and install dependencies
RUN npm config set registry https://registry.npmmirror.com && \
    if [ -f yarn.lock ]; then yarn config set registry https://registry.npmmirror.com && yarn --frozen-lockfile; \
    elif [ -f package-lock.json ]; then npm ci; \
    elif [ -f pnpm-lock.yaml ]; then npm install -g pnpm && pnpm config set registry https://registry.npmmirror.com && pnpm i; \
    else echo "Lockfile not found." && exit 1; \
    fi

##### BUILDER

FROM node:20-alpine AS builder

RUN apk add --no-cache git

WORKDIR /app
ARG NEXT_PUBLIC_API_URL="/deer-flow-backend"

# Offline mode: copy the local sources
ARG RESOURCES_DIR=./build-cache/resources
ARG DEERFLOW_DIR=${RESOURCES_DIR}/deer-flow

COPY ${DEERFLOW_DIR} /deer-flow

RUN cd /deer-flow \
    && mv /deer-flow/web/* /app \
    && rm -rf /deer-flow

COPY --from=deps /app/node_modules ./node_modules

ENV NEXT_TELEMETRY_DISABLED=1

# Configure the npmmirror registry
RUN npm config set registry https://registry.npmmirror.com && \
    if [ -f yarn.lock ]; then yarn config set registry https://registry.npmmirror.com && SKIP_ENV_VALIDATION=1 yarn build; \
    elif [ -f package-lock.json ]; then SKIP_ENV_VALIDATION=1 npm run build; \
    elif [ -f pnpm-lock.yaml ]; then npm install -g pnpm && pnpm config set registry https://registry.npmmirror.com && SKIP_ENV_VALIDATION=1 pnpm run build; \
    else echo "Lockfile not found." && exit 1; \
    fi

##### RUNNER

FROM gcr.nju.edu.cn/distroless/nodejs20-debian12 AS runner
WORKDIR /app

ENV NODE_ENV=production

ENV NEXT_TELEMETRY_DISABLED=1

COPY --from=builder /app/next.config.js ./
COPY --from=builder /app/public ./public
COPY --from=builder /app/package.json ./package.json

COPY --from=builder /app/.next/standalone ./
COPY --from=builder /app/.next/static ./.next/static

EXPOSE 3000
ENV PORT=3000

CMD ["server.js"]
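Both deer-flow Dockerfiles resolve `DEERFLOW_DIR` relative to the build context, so the context must contain `build-cache/resources/deer-flow`. A minimal invocation sketch; note that the npm/yarn install and build steps still reach the configured registry, so this variant suits a weak-network rather than a fully air-gapped build:

```bash
docker build --pull=false \
  --build-arg RESOURCES_DIR=./build-cache/resources \
  -f scripts/offline/Dockerfile.deer-flow-frontend.offline \
  -t deer-flow-frontend:latest \
  .
```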
47  scripts/offline/Dockerfile.gateway.offline  Normal file
@@ -0,0 +1,47 @@
# Offline version of the gateway Dockerfile
FROM maven:3-eclipse-temurin-21 AS builder

# Configure the Aliyun Maven mirror
RUN mkdir -p /root/.m2 && \
    echo '<?xml version="1.0" encoding="UTF-8"?>\n\
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"\n\
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n\
xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">\n\
<mirrors>\n\
<mirror>\n\
<id>aliyunmaven</id>\n\
<mirrorOf>*</mirrorOf>\n\
<name>阿里云公共仓库</name>\n\
<url>https://maven.aliyun.com/repository/public</url>\n\
</mirror>\n\
</mirrors>\n\
</settings>' > /root/.m2/settings.xml

WORKDIR /opt/gateway

COPY backend/pom.xml ./
COPY backend/api-gateway/pom.xml ./api-gateway/

RUN --mount=type=cache,target=/root/.m2/repository \
    cd /opt/gateway/api-gateway && \
    mvn dependency:go-offline -Dmaven.test.skip=true || true

COPY backend/api-gateway /opt/gateway/api-gateway

RUN --mount=type=cache,target=/root/.m2/repository \
    cd /opt/gateway/api-gateway && \
    mvn clean package -Dmaven.test.skip=true

FROM datamate-java-base:latest

COPY --from=builder /opt/gateway/api-gateway/target/gateway.jar /opt/gateway/gateway.jar
COPY scripts/images/gateway/start.sh /opt/gateway/start.sh

RUN dos2unix /opt/gateway/start.sh \
    && chmod +x /opt/gateway/start.sh \
    && ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

EXPOSE 8080

ENTRYPOINT ["/opt/gateway/start.sh"]
CMD ["java", "-Duser.timezone=Asia/Shanghai", "-jar", "/opt/gateway/gateway.jar"]
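To confirm that the generated mirror configuration is actually picked up, the builder stage can be built on its own and inspected; a hedged sketch (the stage name `builder` comes from the Dockerfile above, the image tag is illustrative):

```bash
# Build just the Maven builder stage in the networked environment.
docker build --target builder \
  -f scripts/offline/Dockerfile.gateway.offline \
  -t datamate-gateway-builder .

# The settings.xml written by the RUN step above should list the aliyunmaven mirror.
docker run --rm datamate-gateway-builder cat /root/.m2/settings.xml
```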
54  scripts/offline/Dockerfile.runtime.offline  Normal file
@@ -0,0 +1,54 @@
# Offline version of the runtime Dockerfile
# Change: use local model files instead of downloading them with wget

FROM ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm

# Configure the Aliyun apt mirror
RUN --mount=type=cache,target=/var/cache/apt \
    --mount=type=cache,target=/var/lib/apt \
    if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
    elif [ -f /etc/apt/sources.list ]; then \
        sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
    fi \
    && apt update \
    && apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix swig poppler-utils tesseract-ocr

# Offline mode: local model file paths
ARG RESOURCES_DIR=./build-cache/resources
ARG MODELS_DIR=${RESOURCES_DIR}/models

# Copy the local PaddleOCR model (downloaded beforehand for the offline environment)
RUN mkdir -p /home/models
COPY ${MODELS_DIR}/ch_ppocr_mobile_v2.0_cls_infer.tar /home/models/
RUN tar -xf /home/models/ch_ppocr_mobile_v2.0_cls_infer.tar -C /home/models

COPY runtime/python-executor /opt/runtime
COPY runtime/ops /opt/runtime/datamate/ops
COPY runtime/ops/user /opt/runtime/user
COPY scripts/images/runtime/start.sh /opt/runtime/start.sh

ENV PYTHONPATH=/opt/runtime/datamate/
ENV UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV UV_INDEX_STRATEGY=unsafe-best-match
# Point uv at the Aliyun PyPI mirror
ENV UV_INDEX_URL="https://mirrors.aliyun.com/pypi/simple/"

WORKDIR /opt/runtime

# Copy the local spaCy model (downloaded beforehand for the offline environment)
COPY ${MODELS_DIR}/zh_core_web_sm-3.8.0-py3-none-any.whl /tmp/

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -e .[all] --system \
    && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
    && uv pip install /tmp/zh_core_web_sm-3.8.0-py3-none-any.whl --system \
    && echo "/usr/local/lib/ops/site-packages" > /usr/local/lib/python3.11/site-packages/ops.pth

RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && chmod +x /opt/runtime/start.sh \
    && dos2unix /opt/runtime/start.sh

EXPOSE 8081

ENTRYPOINT ["/opt/runtime/start.sh"]
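The two model files copied above must exist under `build-cache/resources/models/` before the build starts. A hedged sketch of collecting them in the networked environment; both URLs are illustrative of where these artifacts are commonly published and should be replaced with whatever the original online Dockerfile downloads:

```bash
mkdir -p build-cache/resources/models
cd build-cache/resources/models

# PaddleOCR text-direction classification model (illustrative URL).
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar

# spaCy Chinese model wheel (illustrative URL).
wget https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl
```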
42  scripts/offline/Dockerfile.runtime.offline-v2  Normal file
@@ -0,0 +1,42 @@
# Offline version of the runtime Dockerfile, v2
# Uses the base image with pre-installed APT packages

FROM datamate-runtime-base:latest

# Offline mode: local model file paths
ARG RESOURCES_DIR=./build-cache/resources
ARG MODELS_DIR=${RESOURCES_DIR}/models

# Copy the local PaddleOCR model
RUN mkdir -p /home/models
COPY ${MODELS_DIR}/ch_ppocr_mobile_v2.0_cls_infer.tar /home/models/
RUN tar -xf /home/models/ch_ppocr_mobile_v2.0_cls_infer.tar -C /home/models

COPY runtime/python-executor /opt/runtime
COPY runtime/ops /opt/runtime/datamate/ops
COPY runtime/ops/user /opt/runtime/user
COPY scripts/images/runtime/start.sh /opt/runtime/start.sh

ENV PYTHONPATH=/opt/runtime/datamate/
ENV UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV UV_INDEX_STRATEGY=unsafe-best-match
ENV UV_INDEX_URL="https://mirrors.aliyun.com/pypi/simple/"

WORKDIR /opt/runtime

# Copy the local spaCy model
COPY ${MODELS_DIR}/zh_core_web_sm-3.8.0-py3-none-any.whl /tmp/

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -e .[all] --system \
    && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
    && uv pip install /tmp/zh_core_web_sm-3.8.0-py3-none-any.whl --system \
    && echo "/usr/local/lib/ops/site-packages" > /usr/local/lib/python3.11/site-packages/ops.pth

RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && chmod +x /opt/runtime/start.sh \
    && dos2unix /opt/runtime/start.sh

EXPOSE 8081

ENTRYPOINT ["/opt/runtime/start.sh"]
76  scripts/offline/Makefile.offline  Normal file
@@ -0,0 +1,76 @@
# Offline build extension for the Makefile
# Append this content to the main Makefile, or use it standalone
# Usage: make -f Makefile.offline <target>

# Offline build configuration
CACHE_DIR ?= ./build-cache
VERSION ?= latest

# ========== Offline build targets ==========

.PHONY: offline-export
offline-export:
	@echo "导出离线构建缓存..."
	@bash scripts/offline/export-cache.sh $(CACHE_DIR)

.PHONY: offline-build
offline-build:
	@echo "使用缓存进行离线构建..."
	@bash scripts/offline/build-offline.sh $(CACHE_DIR) $(VERSION)

.PHONY: offline-setup
offline-setup:
	@echo "解压并设置离线缓存..."
	@if [ ! -d "$(CACHE_DIR)" ]; then \
		echo "查找缓存压缩包..."; \
		cache_file=$$(ls -t build-cache-*.tar.gz 2>/dev/null | head -1); \
		if [ -z "$$cache_file" ]; then \
			echo "错误: 未找到缓存压缩包 (build-cache-*.tar.gz)"; \
			exit 1; \
		fi; \
		echo "解压 $$cache_file..."; \
		tar -xzf "$$cache_file"; \
	fi
	@echo "✓ 离线缓存准备完成"

# Offline build of a single service
.PHONY: %-offline-build
%-offline-build:
	@echo "离线构建 $*..."
	@$(eval CACHE_FILE := $(CACHE_DIR)/buildkit/$*-cache)
	@$(eval IMAGE_NAME := $(if $(filter deer-flow%,$*),$*,datamate-$*))
	@if [ ! -d "$(CACHE_FILE)" ]; then \
		echo "错误: $* 的缓存不存在于 $(CACHE_FILE)"; \
		exit 1; \
	fi
	@docker buildx build \
		--cache-from type=local,src=$(CACHE_FILE) \
		--network=none \
		-f scripts/images/$*/Dockerfile \
		-t $(IMAGE_NAME):$(VERSION) \
		--load \
		. || echo "警告: $* 离线构建失败"

# Build target compatible with the main Makefile (offline mode)
.PHONY: build-offline
build-offline: offline-setup
	@$(MAKE) offline-build

.PHONY: help-offline
help-offline:
	@echo "离线构建命令:"
	@echo "  make offline-export          - 在有网环境导出构建缓存"
	@echo "  make offline-setup           - 解压并准备离线缓存"
	@echo "  make offline-build           - 在无网环境使用缓存构建"
	@echo "  make <service>-offline-build - 离线构建单个服务"
	@echo ""
	@echo "示例:"
	@echo "  # 有网环境导出缓存"
	@echo "  make offline-export"
	@echo ""
	@echo "  # 传输 build-cache-*.tar.gz 到无网环境"
	@echo "  scp build-cache-20250202.tar.gz user@offline-server:/path/"
	@echo ""
	@echo "  # 无网环境构建"
	@echo "  make offline-setup"
	@echo "  make offline-build"
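Typical standalone usage of this Makefile on the offline host might look like the following sketch (targets as defined above; the version tag is illustrative):

```bash
# Unpack the transferred build-cache archive.
make -f scripts/offline/Makefile.offline offline-setup

# Rebuild every service from the exported BuildKit cache.
make -f scripts/offline/Makefile.offline offline-build VERSION=v1.0.0

# Or rebuild a single service, e.g. the backend.
make -f scripts/offline/Makefile.offline backend-offline-build VERSION=v1.0.0
```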
489  scripts/offline/README.md  Normal file
@@ -0,0 +1,489 @@
# BuildKit 离线构建方案
|
||||||
|
|
||||||
|
本方案使用 Docker BuildKit 的缓存机制,实现在弱网/无网环境下的镜像构建。
|
||||||
|
|
||||||
|
## 方案概述
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────────┐
|
||||||
|
│ 有网环境 (Build Machine) │
|
||||||
|
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │
|
||||||
|
│ │ 基础镜像 │ │ BuildKit │ │ 外部资源 │ │
|
||||||
|
│ │ docker pull │ + │ 缓存导出 │ + │ (模型/源码) │ │
|
||||||
|
│ └─────────────┘ └─────────────┘ └─────────────────────┘ │
|
||||||
|
│ │ │ │ │
|
||||||
|
│ └──────────────────┼──────────────────┘ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌──────────────────┐ │
|
||||||
|
│ │ build-cache.tar.gz│ │
|
||||||
|
│ └────────┬─────────┘ │
|
||||||
|
└─────────────────────────────┼───────────────────────────────────┘
|
||||||
|
│ 传输到无网环境
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────────────┐
|
||||||
|
│ 无网环境 (Offline Machine) │
|
||||||
|
│ ┌──────────────────┐ │
|
||||||
|
│ │ build-cache.tar.gz│ │
|
||||||
|
│ └────────┬─────────┘ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────┐ │
|
||||||
|
│ │ docker load │ │ BuildKit │ │ 本地资源挂载 │ │
|
||||||
|
│ │ 基础镜像 │ + │ 缓存导入 │ + │ (模型/源码) │ │
|
||||||
|
│ └─────────────┘ └─────────────┘ └─────────────────────┘ │
|
||||||
|
│ │ │ │ │
|
||||||
|
│ └──────────────────┼──────────────────┘ │
|
||||||
|
│ ▼ │
|
||||||
|
│ 构建成功! │
|
||||||
|
└─────────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## 快速开始
|
||||||
|
|
||||||
|
### 方法一:使用 Makefile 扩展(推荐)
|
||||||
|
|
||||||
|
#### 1. 合并 Makefile
|
||||||
|
|
||||||
|
将 `Makefile.offline.mk` 追加到主 Makefile:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Linux/Mac
|
||||||
|
cat Makefile.offline.mk >> Makefile
|
||||||
|
|
||||||
|
# Windows (PowerShell)
|
||||||
|
Get-Content Makefile.offline.mk | Add-Content Makefile
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. 有网环境导出缓存
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 导出所有缓存(包括基础镜像、BuildKit 缓存、外部资源)
|
||||||
|
make offline-export
|
||||||
|
|
||||||
|
# 或者指定输出目录
|
||||||
|
make offline-export CACHE_DIR=/path/to/cache
|
||||||
|
```
|
||||||
|
|
||||||
|
执行完成后,会生成压缩包:`build-cache-YYYYMMDD.tar.gz`
|
||||||
|
|
||||||
|
#### 3. 传输到无网环境
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 使用 scp 或其他方式传输
|
||||||
|
scp build-cache-20250202.tar.gz user@offline-server:/opt/datamate/
|
||||||
|
|
||||||
|
# 或者使用 U 盘等物理介质
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 4. 无网环境构建(推荐使用传统方式)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 解压缓存
|
||||||
|
tar -xzf build-cache-20250202.tar.gz
|
||||||
|
|
||||||
|
# 诊断环境(检查基础镜像等)
|
||||||
|
make offline-diagnose
|
||||||
|
|
||||||
|
# 方法 A:传统 docker build(推荐,更稳定)
|
||||||
|
make offline-setup
|
||||||
|
make offline-build-classic
|
||||||
|
|
||||||
|
# 方法 B:BuildKit 构建(如果方法 A 失败)
|
||||||
|
make offline-setup
|
||||||
|
make offline-build
|
||||||
|
|
||||||
|
# 或者指定版本号
|
||||||
|
make offline-build-classic OFFLINE_VERSION=v1.0.0
|
||||||
|
```
|
||||||
|
|
||||||
|
**⚠️ 重要提示**:如果遇到镜像拉取问题,请使用 `make offline-build-classic` 而不是 `make offline-build`。
|
||||||
|
|
||||||
|
### 方法二:使用独立脚本
|
||||||
|
|
||||||
|
#### 导出缓存
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd scripts/offline
|
||||||
|
./export-cache.sh /path/to/output
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 离线构建
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd scripts/offline
|
||||||
|
./build-offline.sh /path/to/cache [version]
|
||||||
|
```
|
||||||
|
|
||||||
|
## 详细说明
|
||||||
|
|
||||||
|
### 缓存内容
|
||||||
|
|
||||||
|
缓存目录结构:
|
||||||
|
|
||||||
|
```
|
||||||
|
build-cache/
|
||||||
|
├── buildkit/ # BuildKit 缓存
|
||||||
|
│ ├── backend-cache/
|
||||||
|
│ ├── backend-python-cache/
|
||||||
|
│ ├── database-cache/
|
||||||
|
│ ├── frontend-cache/
|
||||||
|
│ ├── gateway-cache/
|
||||||
|
│ ├── runtime-cache/
|
||||||
|
│ ├── deer-flow-backend-cache/
|
||||||
|
│ ├── deer-flow-frontend-cache/
|
||||||
|
│ └── mineru-cache/
|
||||||
|
├── images/
|
||||||
|
│ └── base-images.tar # 基础镜像集合
|
||||||
|
└── resources/ # 外部资源
|
||||||
|
├── models/
|
||||||
|
│ ├── ch_ppocr_mobile_v2.0_cls_infer.tar # PaddleOCR 模型
|
||||||
|
│ └── zh_core_web_sm-3.8.0-py3-none-any.whl # spaCy 模型
|
||||||
|
├── DataX/ # DataX 源码
|
||||||
|
└── deer-flow/ # deer-flow 源码
|
||||||
|
```
|
||||||
|
|
||||||
|
### 单个服务构建
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 仅构建 backend
|
||||||
|
make backend-offline-build
|
||||||
|
|
||||||
|
# 仅构建 runtime
|
||||||
|
make runtime-offline-build
|
||||||
|
|
||||||
|
# 仅构建 deer-flow-backend
|
||||||
|
make deer-flow-backend-offline-build
|
||||||
|
```
|
||||||
|
|
||||||
|
### 增量更新
|
||||||
|
|
||||||
|
如果只有部分服务代码变更,可以只导出该服务的缓存:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 重新导出 backend 缓存
|
||||||
|
docker buildx build \
|
||||||
|
--cache-to type=local,dest=./build-cache/buildkit/backend-cache,mode=max \
|
||||||
|
-f scripts/images/backend/Dockerfile \
|
||||||
|
-t datamate-backend:cache .
|
||||||
|
|
||||||
|
# 传输并重新构建
|
||||||
|
tar -czf build-cache-partial.tar.gz build-cache/buildkit/backend-cache
|
||||||
|
# ... 传输到无网环境 ...
|
||||||
|
make backend-offline-build
|
||||||
|
```
|
||||||
|
|
||||||
|
## APT 缓存问题详解
|
||||||
|
|
||||||
|
### 问题描述
|
||||||
|
|
||||||
|
即使使用了 `--mount=type=cache,target=/var/cache/apt`,Dockerfile 中的 `apt-get update` 仍会尝试从网络获取包列表(list 数据),导致无网环境下构建失败:
|
||||||
|
|
||||||
|
```
|
||||||
|
Err:1 http://mirrors.aliyun.com/debian bookworm InRelease
|
||||||
|
Could not resolve 'mirrors.aliyun.com'
|
||||||
|
Reading package lists...
|
||||||
|
E: Failed to fetch http://mirrors.aliyun.com/debian/dists/bookworm/InRelease
|
||||||
|
```
|
||||||
|
|
||||||
|
### 根本原因
|
||||||
|
|
||||||
|
- `--mount=type=cache,target=/var/cache/apt` 只缓存下载的 `.deb` 包
|
||||||
|
- `apt-get update` 会尝试从配置的源获取最新的包索引(InRelease/Packages 文件)
|
||||||
|
- `/var/lib/apt/lists/` 目录存储包索引,但通常不在缓存范围内
|
||||||
|
|
||||||
|
### 解决方案
|
||||||
|
|
||||||
|
#### 方案 1: 使用预装 APT 包的基础镜像(推荐)
|
||||||
|
|
||||||
|
这是最有效的方法:
|
||||||
|
|
||||||
|
**步骤 1**: 在有网环境构建预装所有依赖的基础镜像
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 构建并保存带 APT 预装包的基础镜像
|
||||||
|
./scripts/offline/build-base-images.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
这会创建以下预装基础镜像:
|
||||||
|
- `datamate-java-base` - 用于 backend、gateway(预装 vim、python3、libreoffice 等)
|
||||||
|
- `datamate-python-base` - 用于 backend-python(预装 openjdk、nfs-common 等)
|
||||||
|
- `datamate-runtime-base` - 用于 runtime(预装 libgl1、tesseract-ocr 等)
|
||||||
|
- `deer-flow-backend-base` - 用于 deer-flow-backend
|
||||||
|
- `mineru-base` - 用于 mineru
|
||||||
|
|
||||||
|
**步骤 2**: 在无网环境使用这些基础镜像构建
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 加载包含预装基础镜像的 tar 包
|
||||||
|
docker load -i build-cache/images/base-images-with-apt.tar
|
||||||
|
|
||||||
|
# 使用最终版构建脚本
|
||||||
|
./scripts/offline/build-offline-final.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 方案 2: 修改 Dockerfile 跳过 apt update
|
||||||
|
|
||||||
|
如果确定不需要安装新包,可以修改 Dockerfile:
|
||||||
|
|
||||||
|
```dockerfile
|
||||||
|
# 原代码
|
||||||
|
RUN apt-get update && apt-get install -y xxx
|
||||||
|
|
||||||
|
# 修改为(离线环境)
|
||||||
|
# RUN apt-get update && \
|
||||||
|
RUN apt-get install -y xxx || true
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 方案 3: 挂载 apt lists 缓存
|
||||||
|
|
||||||
|
在有网环境预先下载并保存 apt lists:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 有网环境:保存 apt lists
|
||||||
|
docker run --rm \
|
||||||
|
-v "$(pwd)/apt-lists:/var/lib/apt/lists" \
|
||||||
|
eclipse-temurin:21-jdk \
|
||||||
|
apt-get update
|
||||||
|
|
||||||
|
# 无网环境:挂载保存的 lists
|
||||||
|
docker build \
|
||||||
|
--mount=type=bind,source=$(pwd)/apt-lists,target=/var/lib/apt/lists,ro \
|
||||||
|
-f Dockerfile .
|
||||||
|
```
|
||||||
|
|
||||||
|
**注意**: BuildKit 的 `--mount=type=bind` 在 `docker build` 中不直接支持,需要在 Dockerfile 中使用。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 故障排查
|
||||||
|
|
||||||
|
### 问题 1: 构建时仍然尝试拉取镜像(最常见)
|
||||||
|
|
||||||
|
**现象**:
|
||||||
|
```
|
||||||
|
ERROR: failed to solve: pulling from host ...
|
||||||
|
或
|
||||||
|
ERROR: pull access denied, repository does not exist or may require authorization
|
||||||
|
```
|
||||||
|
|
||||||
|
**原因**:
|
||||||
|
- 基础镜像未正确加载
|
||||||
|
- BuildKit 尝试验证远程镜像
|
||||||
|
|
||||||
|
**解决方案**:
|
||||||
|
|
||||||
|
1. **使用传统构建方式(推荐)**:
|
||||||
|
```bash
|
||||||
|
make offline-build-classic
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **手动加载基础镜像**:
|
||||||
|
```bash
|
||||||
|
# 加载基础镜像
|
||||||
|
docker load -i build-cache/images/base-images.tar
|
||||||
|
|
||||||
|
# 验证镜像存在
|
||||||
|
docker images | grep -E "(maven|eclipse-temurin|mysql|node|nginx)"
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **使用 Docker 守护进程的离线模式**:
|
||||||
|
```bash
|
||||||
|
# 编辑 /etc/docker/daemon.json
|
||||||
|
{
|
||||||
|
"registry-mirrors": [],
|
||||||
|
"insecure-registries": []
|
||||||
|
}
|
||||||
|
|
||||||
|
# 重启 Docker
|
||||||
|
sudo systemctl restart docker
|
||||||
|
```
|
||||||
|
|
||||||
|
### 问题 2: 缓存导入失败
|
||||||
|
|
||||||
|
```
|
||||||
|
ERROR: failed to solve: failed to read cache metadata
|
||||||
|
```
|
||||||
|
|
||||||
|
**解决**: 缓存目录可能损坏,重新在有网环境导出。
|
||||||
|
|
||||||
|
### 问题 3: 基础镜像不存在
|
||||||
|
|
||||||
|
```
|
||||||
|
ERROR: pull access denied
|
||||||
|
```
|
||||||
|
|
||||||
|
**解决**:
|
||||||
|
1. 先执行 `make offline-setup` 加载基础镜像
|
||||||
|
2. 运行 `make offline-diagnose` 检查缺失的镜像
|
||||||
|
3. 重新导出缓存时确保包含所有基础镜像
|
||||||
|
|
||||||
|
### 问题 4: 网络连接错误(无网环境)
|
||||||
|
|
||||||
|
```
|
||||||
|
ERROR: failed to do request: dial tcp: lookup ...
|
||||||
|
```
|
||||||
|
|
||||||
|
**解决**: 检查 Dockerfile 中是否还有网络依赖(如 `git clone`、`wget`、`pip install` 等),可能需要修改 Dockerfile 使用本地资源。
|
||||||
|
|
||||||
|
### 问题 5: 内存不足
|
||||||
|
|
||||||
|
BuildKit 缓存可能占用大量内存,可以设置资源限制:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 创建带资源限制的 buildx 构建器
|
||||||
|
docker buildx create --name offline-builder \
|
||||||
|
--driver docker-container \
|
||||||
|
--driver-opt memory=8g \
|
||||||
|
--use
|
||||||
|
```
|
||||||
|
|
||||||
|
### 问题 6: BuildKit 构建器无法使用本地镜像
|
||||||
|
|
||||||
|
**现象**: 即使镜像已加载,BuildKit 仍提示找不到镜像
|
||||||
|
|
||||||
|
**解决**: BuildKit 的 `docker-container` 驱动无法直接访问本地镜像。使用以下方法之一:
|
||||||
|
|
||||||
|
**方法 A**: 使用传统 Docker 构建(推荐)
|
||||||
|
```bash
|
||||||
|
make offline-build-classic
|
||||||
|
```
|
||||||
|
|
||||||
|
**方法 B**: 将镜像推送到本地 registry
|
||||||
|
```bash
|
||||||
|
# 启动本地 registry
|
||||||
|
docker run -d -p 5000:5000 --name registry registry:2
|
||||||
|
|
||||||
|
# 标记并推送镜像到本地 registry
|
||||||
|
docker tag maven:3-eclipse-temurin-21 localhost:5000/maven:3-eclipse-temurin-21
|
||||||
|
docker push localhost:5000/maven:3-eclipse-temurin-21
|
||||||
|
|
||||||
|
# 修改 Dockerfile 使用本地 registry
|
||||||
|
# FROM localhost:5000/maven:3-eclipse-temurin-21
|
||||||
|
```
|
||||||
|
|
||||||
|
**方法 C**: 使用 `docker` 驱动的 buildx 构建器(不需要推送镜像,但有其他限制)
|
||||||
|
```bash
|
||||||
|
# 创建使用 docker 驱动的构建器
|
||||||
|
docker buildx create --name offline-builder --driver docker --use
|
||||||
|
|
||||||
|
# 但这种方式无法使用 --cache-from type=local
|
||||||
|
# 仅适用于简单的离线构建场景
|
||||||
|
```
|
||||||
|
|
||||||
|
## 限制说明
|
||||||
|
|
||||||
|
1. **镜像版本**: 基础镜像版本必须与缓存导出时一致
|
||||||
|
2. **Dockerfile 变更**: 如果 Dockerfile 发生较大变更,可能需要重新导出缓存
|
||||||
|
3. **资源文件**: mineru 镜像中的模型下载(`mineru-models-download`)仍需要网络,如果需要在完全无网环境使用,需要预先将模型文件挂载到镜像中
|
||||||
|
|
||||||
|
## 高级用法
|
||||||
|
|
||||||
|
### 自定义缓存位置
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make offline-export CACHE_DIR=/mnt/nas/build-cache
|
||||||
|
make offline-build CACHE_DIR=/mnt/nas/build-cache
|
||||||
|
```
|
||||||
|
|
||||||
|
### 导出特定平台缓存
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 导出 ARM64 平台的缓存
|
||||||
|
docker buildx build \
|
||||||
|
--platform linux/arm64 \
|
||||||
|
--cache-to type=local,dest=./build-cache/buildkit/backend-cache,mode=max \
|
||||||
|
-f scripts/images/backend/Dockerfile .
|
||||||
|
```
|
||||||
|
|
||||||
|
### 使用远程缓存(有网环境)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 导出到 S3/MinIO
|
||||||
|
docker buildx build \
|
||||||
|
--cache-to type=s3,region=us-east-1,bucket=mybucket,name=backend-cache \
|
||||||
|
-f scripts/images/backend/Dockerfile .
|
||||||
|
|
||||||
|
# 从 S3 导入
|
||||||
|
docker buildx build \
|
||||||
|
--cache-from type=s3,region=us-east-1,bucket=mybucket,name=backend-cache \
|
||||||
|
-f scripts/images/backend/Dockerfile .
|
||||||
|
```
|
||||||
|
|
||||||
|
## 文件清单
|
||||||
|
|
||||||
|
```
|
||||||
|
scripts/offline/
|
||||||
|
├── export-cache.sh # 有网环境导出缓存脚本
|
||||||
|
├── build-base-images.sh # 构建 APT 预装基础镜像
|
||||||
|
├── build-offline.sh # 基础离线构建脚本(BuildKit)
|
||||||
|
├── build-offline-v2.sh # 增强版离线构建脚本
|
||||||
|
├── build-offline-classic.sh # 传统 docker build 脚本
|
||||||
|
├── build-offline-final.sh # 最终版(使用预装基础镜像,推荐)
|
||||||
|
├── diagnose.sh # 环境诊断脚本
|
||||||
|
├── Dockerfile.base-images # 预装 APT 包的基础镜像定义
|
||||||
|
├── Dockerfile.backend.offline # backend 离线 Dockerfile(使用预装基础镜像)
|
||||||
|
├── Dockerfile.gateway.offline # gateway 离线 Dockerfile(使用预装基础镜像)
|
||||||
|
├── Dockerfile.backend-python.offline # backend-python 离线 Dockerfile
|
||||||
|
├── Dockerfile.backend-python.offline-v2 # backend-python 离线 Dockerfile v2(使用预装基础镜像)
|
||||||
|
├── Dockerfile.runtime.offline # runtime 离线 Dockerfile
|
||||||
|
├── Dockerfile.runtime.offline-v2 # runtime 离线 Dockerfile v2(使用预装基础镜像)
|
||||||
|
├── Dockerfile.deer-flow-backend.offline # deer-flow-backend 离线 Dockerfile
|
||||||
|
├── Dockerfile.deer-flow-frontend.offline # deer-flow-frontend 离线 Dockerfile
|
||||||
|
├── Makefile.offline # 独立离线构建 Makefile
|
||||||
|
└── README.md # 本文档
|
||||||
|
|
||||||
|
Makefile.offline.mk # Makefile 扩展(追加到主 Makefile)
|
||||||
|
```
|
||||||
|
|
||||||
|
## 推荐工作流(解决 APT 问题版)
|
||||||
|
|
||||||
|
### 工作流 A: 使用预装 APT 包的基础镜像(彻底解决 APT 问题)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# ========== 有网环境 ==========
|
||||||
|
|
||||||
|
# 1. 构建并保存带 APT 预装包的基础镜像
|
||||||
|
./scripts/offline/build-base-images.sh
|
||||||
|
# 输出: build-cache/images/base-images-with-apt.tar
|
||||||
|
|
||||||
|
# 2. 导出其他缓存(BuildKit 缓存、外部资源)
|
||||||
|
./scripts/offline/export-cache.sh
|
||||||
|
|
||||||
|
# 3. 打包传输
|
||||||
|
scp build-cache/images/base-images-with-apt.tar user@offline-server:/opt/datamate/build-cache/images/
|
||||||
|
scp build-cache-*.tar.gz user@offline-server:/opt/datamate/
|
||||||
|
|
||||||
|
# ========== 无网环境 ==========
|
||||||
|
|
||||||
|
cd /opt/datamate
|
||||||
|
|
||||||
|
# 4. 解压
|
||||||
|
tar -xzf build-cache-*.tar.gz
|
||||||
|
|
||||||
|
# 5. 加载预装基础镜像(关键!)
|
||||||
|
docker load -i build-cache/images/base-images-with-apt.tar
|
||||||
|
|
||||||
|
# 6. 使用最终版脚本构建
|
||||||
|
./scripts/offline/build-offline-final.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### 工作流 B: 简单场景(使用传统构建)
|
||||||
|
|
||||||
|
如果 APT 包需求简单,可以直接使用传统构建:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 有网环境
|
||||||
|
make offline-export
|
||||||
|
|
||||||
|
# 传输到无网环境
|
||||||
|
scp build-cache-*.tar.gz offline-server:/path/
|
||||||
|
|
||||||
|
# 无网环境
|
||||||
|
tar -xzf build-cache-*.tar.gz
|
||||||
|
make offline-diagnose # 检查环境
|
||||||
|
make offline-build-classic # 使用传统构建
|
||||||
|
```
|
||||||
|
|
||||||
|
## 参考
|
||||||
|
|
||||||
|
- [Docker BuildKit Documentation](https://docs.docker.com/build/buildkit/)
|
||||||
|
- [Cache Storage Backends](https://docs.docker.com/build/cache/backends/)
|
||||||
87  scripts/offline/build-base-images.sh  Normal file
@@ -0,0 +1,87 @@
#!/bin/bash
|
||||||
|
# 构建带有预装 APT 包的基础镜像
|
||||||
|
# Usage: ./build-base-images.sh [output-dir]
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
OUTPUT_DIR="${1:-./build-cache}"
|
||||||
|
IMAGES_DIR="$OUTPUT_DIR/images"
|
||||||
|
|
||||||
|
mkdir -p "$IMAGES_DIR"
|
||||||
|
|
||||||
|
echo "======================================"
|
||||||
|
echo "构建预装 APT 包的基础镜像"
|
||||||
|
echo "======================================"
|
||||||
|
|
||||||
|
# 构建各个基础镜像
|
||||||
|
echo ""
|
||||||
|
echo "1. 构建 datamate-java-base (用于 backend, gateway)..."
|
||||||
|
docker build \
|
||||||
|
-t datamate-java-base:latest \
|
||||||
|
--target datamate-java-base \
|
||||||
|
-f scripts/offline/Dockerfile.base-images \
|
||||||
|
. || echo "Warning: datamate-java-base 构建失败"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "2. 构建 datamate-python-base (用于 backend-python)..."
|
||||||
|
docker build \
|
||||||
|
-t datamate-python-base:latest \
|
||||||
|
--target datamate-python-base \
|
||||||
|
-f scripts/offline/Dockerfile.base-images \
|
||||||
|
. || echo "Warning: datamate-python-base 构建失败"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "3. 构建 datamate-runtime-base (用于 runtime)..."
|
||||||
|
docker build \
|
||||||
|
-t datamate-runtime-base:latest \
|
||||||
|
--target datamate-runtime-base \
|
||||||
|
-f scripts/offline/Dockerfile.base-images \
|
||||||
|
. || echo "Warning: datamate-runtime-base 构建失败"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "4. 构建 deer-flow-backend-base (用于 deer-flow-backend)..."
|
||||||
|
docker build \
|
||||||
|
-t deer-flow-backend-base:latest \
|
||||||
|
--target deer-flow-backend-base \
|
||||||
|
-f scripts/offline/Dockerfile.base-images \
|
||||||
|
. || echo "Warning: deer-flow-backend-base 构建失败"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "5. 构建 mineru-base (用于 mineru)..."
|
||||||
|
docker build \
|
||||||
|
-t mineru-base:latest \
|
||||||
|
--target mineru-base \
|
||||||
|
-f scripts/offline/Dockerfile.base-images \
|
||||||
|
. || echo "Warning: mineru-base 构建失败"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "======================================"
|
||||||
|
echo "保存基础镜像集合"
|
||||||
|
echo "======================================"
|
||||||
|
|
||||||
|
docker save -o "$IMAGES_DIR/base-images-with-apt.tar" \
|
||||||
|
maven:3-eclipse-temurin-21 \
|
||||||
|
maven:3-eclipse-temurin-8 \
|
||||||
|
eclipse-temurin:21-jdk \
|
||||||
|
mysql:8 \
|
||||||
|
node:20-alpine \
|
||||||
|
nginx:1.29 \
|
||||||
|
ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm \
|
||||||
|
ghcr.nju.edu.cn/astral-sh/uv:python3.12-bookworm \
|
||||||
|
ghcr.nju.edu.cn/astral-sh/uv:latest \
|
||||||
|
python:3.12-slim \
|
||||||
|
python:3.11-slim \
|
||||||
|
gcr.io/distroless/nodejs20-debian12 \
|
||||||
|
datamate-java-base:latest \
|
||||||
|
datamate-python-base:latest \
|
||||||
|
datamate-runtime-base:latest \
|
||||||
|
deer-flow-backend-base:latest \
|
||||||
|
mineru-base:latest \
|
||||||
|
2>/dev/null || echo "Warning: 部分镜像保存失败"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "======================================"
|
||||||
|
echo "✓ 基础镜像构建完成"
|
||||||
|
echo "======================================"
|
||||||
|
echo "镜像列表:"
|
||||||
|
docker images | grep -E "(datamate-|deer-flow-|mineru-)base" || true
|
||||||
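On the offline host, the archive produced by this script is loaded and verified with something along these lines (a sketch; image names follow the `docker save` list above):

```bash
docker load -i build-cache/images/base-images-with-apt.tar

# The pre-provisioned base images should now be visible locally.
docker images --format '{{.Repository}}:{{.Tag}}' | grep -E -- '-base:latest'
```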
206  scripts/offline/build-offline-classic.sh  Normal file
@@ -0,0 +1,206 @@
#!/bin/bash
|
||||||
|
# 传统 docker build 离线构建脚本(不使用 buildx)
|
||||||
|
# 这种方式更稳定,兼容性更好
|
||||||
|
# Usage: ./build-offline-classic.sh [cache-dir] [version]
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
CACHE_DIR="${1:-./build-cache}"
|
||||||
|
VERSION="${2:-latest}"
|
||||||
|
IMAGES_DIR="$CACHE_DIR/images"
|
||||||
|
RESOURCES_DIR="$CACHE_DIR/resources"
|
||||||
|
|
||||||
|
# 颜色输出
|
||||||
|
RED='\033[0;31m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
YELLOW='\033[1;33m'
|
||||||
|
BLUE='\033[0;34m'
|
||||||
|
NC='\033[0m'
|
||||||
|
|
||||||
|
log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
|
||||||
|
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
|
||||||
|
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
|
||||||
|
log_debug() { echo -e "${BLUE}[DEBUG]${NC} $1"; }
|
||||||
|
|
||||||
|
# 检查缓存目录
|
||||||
|
if [ ! -d "$CACHE_DIR" ]; then
|
||||||
|
log_error "缓存目录 $CACHE_DIR 不存在"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 加载基础镜像
|
||||||
|
load_base_images() {
|
||||||
|
log_info "加载基础镜像..."
|
||||||
|
|
||||||
|
if [ ! -f "$IMAGES_DIR/base-images.tar" ]; then
|
||||||
|
log_warn "基础镜像 tar 包不存在,检查本地镜像..."
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_info "从 $IMAGES_DIR/base-images.tar 加载..."
|
||||||
|
docker load -i "$IMAGES_DIR/base-images.tar"
|
||||||
|
log_info "✓ 基础镜像加载完成"
|
||||||
|
}
|
||||||
|
|
||||||
|
# 检查镜像是否存在
|
||||||
|
check_image() {
|
||||||
|
docker inspect "$1" > /dev/null 2>&1
|
||||||
|
}
|
||||||
|
|
||||||
|
# 构建函数
|
||||||
|
build_service() {
|
||||||
|
local service_name=$1
|
||||||
|
local image_name=$2
|
||||||
|
local dockerfile=$3
|
||||||
|
|
||||||
|
log_info "----------------------------------------"
|
||||||
|
log_info "构建 $service_name"
|
||||||
|
log_info "----------------------------------------"
|
||||||
|
|
||||||
|
# 检查 Dockerfile 是否存在
|
||||||
|
if [ ! -f "$dockerfile" ]; then
|
||||||
|
log_error "Dockerfile 不存在: $dockerfile"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 获取所需的基础镜像
|
||||||
|
local from_images
|
||||||
|
from_images=$(grep -E '^FROM' "$dockerfile" | sed 's/FROM //' | sed 's/ AS .*//' | sed 's/ as .*//' | awk '{print $1}' | sort -u)
|
||||||
|
|
||||||
|
log_info "检查基础镜像..."
|
||||||
|
local all_exist=true
|
||||||
|
for img in $from_images; do
|
||||||
|
# 跳过多阶段构建的中间阶段引用
|
||||||
|
if [[ "$img" == --from=* ]]; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
if check_image "$img"; then
|
||||||
|
log_info " ✓ $img"
|
||||||
|
else
|
||||||
|
log_error " ✗ $img (缺失)"
|
||||||
|
all_exist=false
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "$all_exist" = false ]; then
|
||||||
|
log_error "缺少必要的基础镜像,无法构建 $service_name"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 准备构建参数
|
||||||
|
local build_args=()
|
||||||
|
|
||||||
|
# 根据服务类型添加特殊处理
|
||||||
|
case "$service_name" in
|
||||||
|
runtime)
|
||||||
|
# runtime 需要模型文件
|
||||||
|
if [ -d "$RESOURCES_DIR/models" ]; then
|
||||||
|
log_info "使用本地模型文件"
|
||||||
|
build_args+=("--build-arg" "RESOURCES_DIR=$RESOURCES_DIR")
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
backend-python)
|
||||||
|
if [ -d "$RESOURCES_DIR/DataX" ]; then
|
||||||
|
log_info "使用本地 DataX 源码"
|
||||||
|
build_args+=("--build-arg" "RESOURCES_DIR=$RESOURCES_DIR")
|
||||||
|
build_args+=("--build-arg" "DATAX_LOCAL_PATH=$RESOURCES_DIR/DataX")
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
deer-flow-backend|deer-flow-frontend)
|
||||||
|
if [ -d "$RESOURCES_DIR/deer-flow" ]; then
|
||||||
|
log_info "使用本地 deer-flow 源码"
|
||||||
|
build_args+=("--build-arg" "RESOURCES_DIR=$RESOURCES_DIR")
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
# 执行构建
|
||||||
|
log_info "开始构建..."
|
||||||
|
if docker build \
|
||||||
|
--pull=false \
|
||||||
|
"${build_args[@]}" \
|
||||||
|
-f "$dockerfile" \
|
||||||
|
-t "$image_name:$VERSION" \
|
||||||
|
. 2>&1; then
|
||||||
|
log_info "✓ $service_name 构建成功"
|
||||||
|
return 0
|
||||||
|
else
|
||||||
|
log_error "✗ $service_name 构建失败"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# 主流程
|
||||||
|
main() {
|
||||||
|
log_info "======================================"
|
||||||
|
log_info "传统 Docker 离线构建"
|
||||||
|
log_info "======================================"
|
||||||
|
|
||||||
|
# 加载基础镜像
|
||||||
|
load_base_images
|
||||||
|
|
||||||
|
# 定义要构建的服务
|
||||||
|
declare -A SERVICES=(
|
||||||
|
["database"]="datamate-database:scripts/images/database/Dockerfile"
|
||||||
|
["gateway"]="datamate-gateway:scripts/images/gateway/Dockerfile"
|
||||||
|
["backend"]="datamate-backend:scripts/images/backend/Dockerfile"
|
||||||
|
["frontend"]="datamate-frontend:scripts/images/frontend/Dockerfile"
|
||||||
|
["runtime"]="datamate-runtime:scripts/images/runtime/Dockerfile"
|
||||||
|
["backend-python"]="datamate-backend-python:scripts/images/backend-python/Dockerfile"
|
||||||
|
)
|
||||||
|
|
||||||
|
# deer-flow 和 mineru 是可选的
|
||||||
|
OPTIONAL_SERVICES=(
|
||||||
|
"deer-flow-backend:deer-flow-backend:scripts/images/deer-flow-backend/Dockerfile"
|
||||||
|
"deer-flow-frontend:deer-flow-frontend:scripts/images/deer-flow-frontend/Dockerfile"
|
||||||
|
"mineru:datamate-mineru:scripts/images/mineru/Dockerfile"
|
||||||
|
)
|
||||||
|
|
||||||
|
log_info ""
|
||||||
|
log_info "构建核心服务..."
|
||||||
|
local failed=()
|
||||||
|
local succeeded=()
|
||||||
|
|
||||||
|
for service_name in "${!SERVICES[@]}"; do
|
||||||
|
IFS=':' read -r image_name dockerfile <<< "${SERVICES[$service_name]}"
|
||||||
|
if build_service "$service_name" "$image_name" "$dockerfile"; then
|
||||||
|
succeeded+=("$service_name")
|
||||||
|
else
|
||||||
|
failed+=("$service_name")
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
done
|
||||||
|
|
||||||
|
# 尝试构建可选服务
|
||||||
|
log_info "构建可选服务..."
|
||||||
|
for service_config in "${OPTIONAL_SERVICES[@]}"; do
|
||||||
|
IFS=':' read -r service_name image_name dockerfile <<< "$service_config"
|
||||||
|
if build_service "$service_name" "$image_name" "$dockerfile"; then
|
||||||
|
succeeded+=("$service_name")
|
||||||
|
else
|
||||||
|
log_warn "$service_name 构建失败(可选服务,继续)"
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
done
|
||||||
|
|
||||||
|
# 汇总
|
||||||
|
log_info "======================================"
|
||||||
|
log_info "构建结果"
|
||||||
|
log_info "======================================"
|
||||||
|
|
||||||
|
if [ ${#succeeded[@]} -gt 0 ]; then
|
||||||
|
log_info "成功 (${#succeeded[@]}): ${succeeded[*]}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ${#failed[@]} -gt 0 ]; then
|
||||||
|
log_error "失败 (${#failed[@]}): ${failed[*]}"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
log_info "✓ 所有核心服务构建成功!"
|
||||||
|
echo ""
|
||||||
|
docker images --format "table {{.Repository}}:{{.Tag}}\t{{.Size}}" | grep -E "(datamate-|deer-flow-)" || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
181  scripts/offline/build-offline-final.sh  Normal file
@@ -0,0 +1,181 @@
#!/bin/bash
|
||||||
|
# 最终版离线构建脚本 - 使用预装 APT 包的基础镜像
|
||||||
|
# Usage: ./build-offline-final.sh [cache-dir] [version]
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
CACHE_DIR="${1:-./build-cache}"
|
||||||
|
VERSION="${2:-latest}"
|
||||||
|
IMAGES_DIR="$CACHE_DIR/images"
|
||||||
|
RESOURCES_DIR="$CACHE_DIR/resources"
|
||||||
|
|
||||||
|
RED='\033[0;31m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
YELLOW='\033[1;33m'
|
||||||
|
NC='\033[0m'
|
||||||
|
|
||||||
|
log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
|
||||||
|
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
|
||||||
|
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
|
||||||
|
|
||||||
|
# 检查缓存目录
|
||||||
|
if [ ! -d "$CACHE_DIR" ]; then
|
||||||
|
log_error "缓存目录 $CACHE_DIR 不存在"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 加载基础镜像
|
||||||
|
load_images() {
|
||||||
|
log_info "加载基础镜像..."
|
||||||
|
|
||||||
|
# 优先加载带 APT 预装包的镜像集合
|
||||||
|
if [ -f "$IMAGES_DIR/base-images-with-apt.tar" ]; then
|
||||||
|
log_info "加载带 APT 预装包的基础镜像..."
|
||||||
|
docker load -i "$IMAGES_DIR/base-images-with-apt.tar"
|
||||||
|
elif [ -f "$IMAGES_DIR/base-images.tar" ]; then
|
||||||
|
log_warn "加载普通基础镜像(不含 APT 预装包)..."
|
||||||
|
docker load -i "$IMAGES_DIR/base-images.tar"
|
||||||
|
else
|
||||||
|
log_warn "基础镜像 tar 包不存在,检查本地镜像..."
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_info "✓ 镜像加载完成"
|
||||||
|
}
|
||||||
|
|
||||||
|
# 验证镜像是否存在
|
||||||
|
verify_image() {
|
||||||
|
docker inspect "$1" > /dev/null 2>&1
|
||||||
|
}
|
||||||
|
|
||||||
|
# 构建函数
|
||||||
|
build_service() {
|
||||||
|
local service_name=$1
|
||||||
|
local image_name=$2
|
||||||
|
local dockerfile=$3
|
||||||
|
local base_image=$4 # 必需的基础镜像
|
||||||
|
|
||||||
|
log_info "----------------------------------------"
|
||||||
|
log_info "构建 $service_name"
|
||||||
|
log_info "----------------------------------------"
|
||||||
|
|
||||||
|
if [ ! -f "$dockerfile" ]; then
|
||||||
|
log_error "Dockerfile 不存在: $dockerfile"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 检查必需的基础镜像
|
||||||
|
if [ -n "$base_image" ]; then
|
||||||
|
if verify_image "$base_image"; then
|
||||||
|
log_info "✓ 基础镜像存在: $base_image"
|
||||||
|
else
|
||||||
|
log_error "✗ 缺少基础镜像: $base_image"
|
||||||
|
log_info "请确保已加载正确的 base-images-with-apt.tar"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 准备构建参数
|
||||||
|
local build_args=()
|
||||||
|
|
||||||
|
# 添加资源目录参数
|
||||||
|
if [ -d "$RESOURCES_DIR" ]; then
|
||||||
|
build_args+=("--build-arg" "RESOURCES_DIR=$RESOURCES_DIR")
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 执行构建
|
||||||
|
log_info "开始构建..."
|
||||||
|
if docker build \
|
||||||
|
--pull=false \
|
||||||
|
"${build_args[@]}" \
|
||||||
|
-f "$dockerfile" \
|
||||||
|
-t "$image_name:$VERSION" \
|
||||||
|
. 2>&1; then
|
||||||
|
log_info "✓ $service_name 构建成功"
|
||||||
|
return 0
|
||||||
|
else
|
||||||
|
log_error "✗ $service_name 构建失败"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# 主流程
|
||||||
|
main() {
|
||||||
|
log_info "======================================"
|
||||||
|
log_info "最终版离线构建 (使用 APT 预装基础镜像)"
|
||||||
|
log_info "======================================"
|
||||||
|
|
||||||
|
# 加载基础镜像
|
||||||
|
load_images
|
||||||
|
|
||||||
|
# 验证关键基础镜像
|
||||||
|
log_info ""
|
||||||
|
log_info "验证预装基础镜像..."
|
||||||
|
REQUIRED_BASE_IMAGES=(
|
||||||
|
"datamate-java-base:latest"
|
||||||
|
"datamate-python-base:latest"
|
||||||
|
"datamate-runtime-base:latest"
|
||||||
|
)
|
||||||
|
|
||||||
|
for img in "${REQUIRED_BASE_IMAGES[@]}"; do
|
||||||
|
if verify_image "$img"; then
|
||||||
|
log_info " ✓ $img"
|
||||||
|
else
|
||||||
|
log_warn " ✗ $img (缺失)"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# 定义服务配置
|
||||||
|
declare -A SERVICES=(
|
||||||
|
["database"]="datamate-database:scripts/images/database/Dockerfile:"
|
||||||
|
["gateway"]="datamate-gateway:scripts/offline/Dockerfile.gateway.offline:datamate-java-base:latest"
|
||||||
|
["backend"]="datamate-backend:scripts/offline/Dockerfile.backend.offline:datamate-java-base:latest"
|
||||||
|
["frontend"]="datamate-frontend:scripts/images/frontend/Dockerfile:"
|
||||||
|
["runtime"]="datamate-runtime:scripts/offline/Dockerfile.runtime.offline-v2:datamate-runtime-base:latest"
|
||||||
|
["backend-python"]="datamate-backend-python:scripts/offline/Dockerfile.backend-python.offline-v2:datamate-python-base:latest"
|
||||||
|
)
|
||||||
|
|
||||||
|
log_info ""
|
||||||
|
log_info "======================================"
|
||||||
|
log_info "开始构建服务"
|
||||||
|
log_info "======================================"
|
||||||
|
|
||||||
|
local failed=()
|
||||||
|
local succeeded=()
|
||||||
|
|
||||||
|
for service_name in "${!SERVICES[@]}"; do
|
||||||
|
IFS=':' read -r image_name dockerfile base_image <<< "${SERVICES[$service_name]}"
|
||||||
|
if build_service "$service_name" "$image_name" "$dockerfile" "$base_image"; then
|
||||||
|
succeeded+=("$service_name")
|
||||||
|
else
|
||||||
|
failed+=("$service_name")
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
done
|
||||||
|
|
||||||
|
# 汇总
|
||||||
|
log_info "======================================"
|
||||||
|
log_info "构建结果"
|
||||||
|
log_info "======================================"
|
||||||
|
|
||||||
|
if [ ${#succeeded[@]} -gt 0 ]; then
|
||||||
|
log_info "成功 (${#succeeded[@]}): ${succeeded[*]}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ${#failed[@]} -gt 0 ]; then
|
||||||
|
log_error "失败 (${#failed[@]}): ${failed[*]}"
|
||||||
|
|
||||||
|
log_info ""
|
||||||
|
log_info "提示: 如果失败是因为缺少预装基础镜像,请确保:"
|
||||||
|
log_info " 1. 在有网环境执行: ./scripts/offline/build-base-images.sh"
|
||||||
|
log_info " 2. 将生成的 base-images-with-apt.tar 传输到无网环境"
|
||||||
|
log_info " 3. 在无网环境加载: docker load -i base-images-with-apt.tar"
|
||||||
|
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
log_info "✓ 所有服务构建成功!"
|
||||||
|
echo ""
|
||||||
|
docker images --format "table {{.Repository}}:{{.Tag}}\t{{.Size}}" | grep -E "(datamate-|deer-flow-)" || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
249  scripts/offline/build-offline-v2.sh  Normal file
@@ -0,0 +1,249 @@
#!/bin/bash
|
||||||
|
# BuildKit 离线构建脚本 v2 - 增强版
|
||||||
|
# Usage: ./build-offline-v2.sh [cache-dir] [version]
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
CACHE_DIR="${1:-./build-cache}"
|
||||||
|
VERSION="${2:-latest}"
|
||||||
|
BUILDKIT_CACHE_DIR="$CACHE_DIR/buildkit"
|
||||||
|
IMAGES_DIR="$CACHE_DIR/images"
|
||||||
|
RESOURCES_DIR="$CACHE_DIR/resources"
|
||||||
|
|
||||||
|
# 颜色输出
|
||||||
|
RED='\033[0;31m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
YELLOW='\033[1;33m'
|
||||||
|
NC='\033[0m' # No Color
|
||||||
|
|
||||||
|
log_info() {
|
||||||
|
echo -e "${GREEN}[INFO]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
log_warn() {
|
||||||
|
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
log_error() {
|
||||||
|
echo -e "${RED}[ERROR]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
# 检查缓存目录
|
||||||
|
if [ ! -d "$CACHE_DIR" ]; then
|
||||||
|
log_error "缓存目录 $CACHE_DIR 不存在"
|
||||||
|
log_info "请先解压缓存包: tar -xzf build-cache-*.tar.gz"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 确保 buildx 构建器存在(使用 docker 驱动以支持本地镜像)
|
||||||
|
setup_buildx() {
|
||||||
|
log_info "设置 BuildKit 构建器..."
|
||||||
|
|
||||||
|
# 删除旧的构建器
|
||||||
|
if docker buildx inspect offline-builder > /dev/null 2>&1; then
|
||||||
|
docker buildx rm offline-builder 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 创建新的构建器,使用 docker 驱动(支持本地镜像,不需要推送到 registry)
|
||||||
|
docker buildx create --name offline-builder \
|
||||||
|
--driver docker-container \
|
||||||
|
--driver-opt image=moby/buildkit:buildx-stable-1 \
|
||||||
|
--use
|
||||||
|
|
||||||
|
log_info "BuildKit 构建器创建完成"
|
||||||
|
}
|
||||||
|
|
||||||
|
# 加载基础镜像
|
||||||
|
load_base_images() {
|
||||||
|
log_info "加载基础镜像..."
|
||||||
|
|
||||||
|
if [ ! -f "$IMAGES_DIR/base-images.tar" ]; then
|
||||||
|
log_warn "基础镜像文件不存在: $IMAGES_DIR/base-images.tar"
|
||||||
|
log_info "检查本地是否存在所需镜像..."
|
||||||
|
|
||||||
|
# 检查关键镜像是否存在
|
||||||
|
required_images=(
|
||||||
|
"maven:3-eclipse-temurin-21"
|
||||||
|
"eclipse-temurin:21-jdk"
|
||||||
|
"mysql:8"
|
||||||
|
"node:20-alpine"
|
||||||
|
"nginx:1.29"
|
||||||
|
)
|
||||||
|
|
||||||
|
for img in "${required_images[@]}"; do
|
||||||
|
if ! docker inspect "$img" > /dev/null 2>&1; then
|
||||||
|
log_error "缺少基础镜像: $img"
|
||||||
|
log_info "请确保基础镜像已加载: docker load -i base-images.tar"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
log_info "本地基础镜像检查通过"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_info "从 $IMAGES_DIR/base-images.tar 加载基础镜像..."
|
||||||
|
docker load -i "$IMAGES_DIR/base-images.tar"
|
||||||
|
log_info "基础镜像加载完成"
|
||||||
|
}
|
||||||
|
|
||||||
|
# 验证镜像是否存在
|
||||||
|
verify_image() {
|
||||||
|
local image_name=$1
|
||||||
|
if docker inspect "$image_name" > /dev/null 2>&1; then
|
||||||
|
return 0
|
||||||
|
else
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# 离线构建函数
|
||||||
|
offline_build() {
|
||||||
|
local service_name=$1
|
||||||
|
local image_name=$2
|
||||||
|
local dockerfile=$3
|
||||||
|
local cache_file="$BUILDKIT_CACHE_DIR/${service_name}-cache"
|
||||||
|
|
||||||
|
log_info "----------------------------------------"
|
||||||
|
log_info "构建 [$service_name] -> $image_name:$VERSION"
|
||||||
|
log_info "----------------------------------------"
|
||||||
|
|
||||||
|
if [ ! -d "$cache_file" ]; then
|
||||||
|
log_warn "$service_name 的缓存不存在,跳过..."
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 获取 Dockerfile 中的基础镜像
|
||||||
|
local base_images
|
||||||
|
base_images=$(grep -E '^FROM' "$dockerfile" | awk '{print $2}' | sort -u)
|
||||||
|
|
||||||
|
log_info "检查基础镜像..."
|
||||||
|
for base_img in $base_images; do
|
||||||
|
# 跳过多阶段构建中的 AS 别名
|
||||||
|
base_img=$(echo "$base_img" | cut -d':' -f1-2 | sed 's/AS.*//i' | tr -d ' ')
|
||||||
|
|
||||||
|
if [ -z "$base_img" ] || [[ "$base_img" == *"AS"* ]]; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
if verify_image "$base_img"; then
|
||||||
|
log_info " ✓ $base_img"
|
||||||
|
else
|
||||||
|
log_warn " ✗ $base_img (未找到)"
|
||||||
|
# 尝试从 base-images.tar 中加载
|
||||||
|
if [ -f "$IMAGES_DIR/base-images.tar" ]; then
|
||||||
|
log_info " 尝试从 tar 包加载..."
|
||||||
|
docker load -i "$IMAGES_DIR/base-images.tar" 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# 执行离线构建
|
||||||
|
log_info "开始构建..."
|
||||||
|
|
||||||
|
# 构建参数
|
||||||
|
local build_args=()
|
||||||
|
|
||||||
|
# 为需要外部资源的服务添加 build-arg
|
||||||
|
case "$service_name" in
|
||||||
|
runtime|deer-flow-backend|deer-flow-frontend)
|
||||||
|
if [ -d "$RESOURCES_DIR" ]; then
|
||||||
|
build_args+=("--build-arg" "RESOURCES_DIR=$RESOURCES_DIR")
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
backend-python)
|
||||||
|
if [ -d "$RESOURCES_DIR" ]; then
|
||||||
|
build_args+=("--build-arg" "RESOURCES_DIR=$RESOURCES_DIR")
|
||||||
|
build_args+=("--build-arg" "DATAX_LOCAL_PATH=$RESOURCES_DIR/DataX")
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
# 执行构建
|
||||||
|
if docker buildx build \
|
||||||
|
--builder offline-builder \
|
||||||
|
--cache-from "type=local,src=$cache_file" \
|
||||||
|
--pull=false \
|
||||||
|
--output "type=docker" \
|
||||||
|
"${build_args[@]}" \
|
||||||
|
-f "$dockerfile" \
|
||||||
|
-t "$image_name:$VERSION" \
|
||||||
|
. 2>&1; then
|
||||||
|
|
||||||
|
log_info "✓ $service_name 构建成功"
|
||||||
|
return 0
|
||||||
|
else
|
||||||
|
log_error "✗ $service_name 构建失败"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# 主流程
|
||||||
|
main() {
|
||||||
|
log_info "======================================"
|
||||||
|
log_info "BuildKit 离线构建"
|
||||||
|
log_info "======================================"
|
||||||
|
log_info "缓存目录: $CACHE_DIR"
|
||||||
|
log_info "版本: $VERSION"
|
||||||
|
|
||||||
|
# 步骤 1: 设置构建器
|
||||||
|
setup_buildx
|
||||||
|
|
||||||
|
# 步骤 2: 加载基础镜像
|
||||||
|
load_base_images
|
||||||
|
|
||||||
|
# 步骤 3: 定义服务列表
|
||||||
|
declare -A SERVICES=(
|
||||||
|
["database"]="datamate-database:scripts/images/database/Dockerfile"
|
||||||
|
["gateway"]="datamate-gateway:scripts/images/gateway/Dockerfile"
|
||||||
|
["backend"]="datamate-backend:scripts/images/backend/Dockerfile"
|
||||||
|
["frontend"]="datamate-frontend:scripts/images/frontend/Dockerfile"
|
||||||
|
["runtime"]="datamate-runtime:scripts/images/runtime/Dockerfile"
|
||||||
|
["backend-python"]="datamate-backend-python:scripts/images/backend-python/Dockerfile"
|
||||||
|
["deer-flow-backend"]="deer-flow-backend:scripts/images/deer-flow-backend/Dockerfile"
|
||||||
|
["deer-flow-frontend"]="deer-flow-frontend:scripts/images/deer-flow-frontend/Dockerfile"
|
||||||
|
["mineru"]="datamate-mineru:scripts/images/mineru/Dockerfile"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 步骤 4: 批量构建
|
||||||
|
log_info ""
|
||||||
|
log_info "======================================"
|
||||||
|
log_info "开始批量构建"
|
||||||
|
log_info "======================================"
|
||||||
|
|
||||||
|
local failed=()
|
||||||
|
local succeeded=()
|
||||||
|
|
||||||
|
for service_name in "${!SERVICES[@]}"; do
|
||||||
|
IFS=':' read -r image_name dockerfile <<< "${SERVICES[$service_name]}"
|
||||||
|
|
||||||
|
if offline_build "$service_name" "$image_name" "$dockerfile"; then
|
||||||
|
succeeded+=("$service_name")
|
||||||
|
else
|
||||||
|
failed+=("$service_name")
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
done
|
||||||
|
|
||||||
|
# 步骤 5: 汇总结果
|
||||||
|
log_info "======================================"
|
||||||
|
log_info "构建完成"
|
||||||
|
log_info "======================================"
|
||||||
|
|
||||||
|
if [ ${#succeeded[@]} -gt 0 ]; then
|
||||||
|
log_info "成功 (${#succeeded[@]}): ${succeeded[*]}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ${#failed[@]} -gt 0 ]; then
|
||||||
|
log_error "失败 (${#failed[@]}): ${failed[*]}"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
log_info "✓ 所有服务构建成功!"
|
||||||
|
echo ""
|
||||||
|
log_info "镜像列表:"
|
||||||
|
docker images --format "table {{.Repository}}:{{.Tag}}\t{{.Size}}" | grep -E "(datamate-|deer-flow-)" || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# 执行主流程
|
||||||
|
main "$@"
|
||||||
109  scripts/offline/build-offline.sh  Normal file
@@ -0,0 +1,109 @@
#!/bin/bash
|
||||||
|
# BuildKit 离线构建脚本 - 在无网环境执行
|
||||||
|
# Usage: ./build-offline.sh [cache-dir] [version]
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
CACHE_DIR="${1:-./build-cache}"
|
||||||
|
VERSION="${2:-latest}"
|
||||||
|
BUILDKIT_CACHE_DIR="$CACHE_DIR/buildkit"
|
||||||
|
IMAGES_DIR="$CACHE_DIR/images"
|
||||||
|
RESOURCES_DIR="$CACHE_DIR/resources"
|
||||||
|
|
||||||
|
# 检查缓存目录
|
||||||
|
if [ ! -d "$CACHE_DIR" ]; then
|
||||||
|
echo "错误: 缓存目录 $CACHE_DIR 不存在"
|
||||||
|
echo "请先解压缓存包: tar -xzf build-cache-*.tar.gz"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 确保 buildx 构建器存在
|
||||||
|
if ! docker buildx inspect offline-builder > /dev/null 2>&1; then
|
||||||
|
echo "创建 buildx 构建器..."
|
||||||
|
docker buildx create --name offline-builder --driver docker-container --use
|
||||||
|
else
|
||||||
|
docker buildx use offline-builder
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "======================================"
|
||||||
|
echo "1. 加载基础镜像"
|
||||||
|
echo "======================================"
|
||||||
|
|
||||||
|
if [ -f "$IMAGES_DIR/base-images.tar" ]; then
|
||||||
|
echo "从 $IMAGES_DIR/base-images.tar 加载基础镜像..."
|
||||||
|
docker load -i "$IMAGES_DIR/base-images.tar"
|
||||||
|
echo "✓ 基础镜像加载完成"
|
||||||
|
else
|
||||||
|
echo "警告: 基础镜像文件不存在,假设镜像已存在"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "======================================"
|
||||||
|
echo "2. 离线构建服务"
|
||||||
|
echo "======================================"
|
||||||
|
|
||||||
|
# 定义服务配置(与 export-cache.sh 保持一致)
|
||||||
|
SERVICES=(
|
||||||
|
"backend:datamate-backend:scripts/images/backend/Dockerfile"
|
||||||
|
"backend-python:datamate-backend-python:scripts/images/backend-python/Dockerfile"
|
||||||
|
"database:datamate-database:scripts/images/database/Dockerfile"
|
||||||
|
"frontend:datamate-frontend:scripts/images/frontend/Dockerfile"
|
||||||
|
"gateway:datamate-gateway:scripts/images/gateway/Dockerfile"
|
||||||
|
"runtime:datamate-runtime:scripts/images/runtime/Dockerfile"
|
||||||
|
"deer-flow-backend:deer-flow-backend:scripts/images/deer-flow-backend/Dockerfile"
|
||||||
|
"deer-flow-frontend:deer-flow-frontend:scripts/images/deer-flow-frontend/Dockerfile"
|
||||||
|
"mineru:datamate-mineru:scripts/images/mineru/Dockerfile"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 检查是否有资源目录需要挂载
|
||||||
|
MOUNT_ARGS=""
|
||||||
|
if [ -d "$RESOURCES_DIR" ]; then
|
||||||
|
echo "检测到资源目录,将用于本地资源挂载"
|
||||||
|
MOUNT_ARGS="--build-arg RESOURCES_DIR=$RESOURCES_DIR"
|
||||||
|
fi
|
||||||
|
|
||||||
|
for service_config in "${SERVICES[@]}"; do
|
||||||
|
IFS=':' read -r service_name image_name dockerfile <<< "$service_config"
|
||||||
|
cache_file="$BUILDKIT_CACHE_DIR/$service_name-cache"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "--------------------------------------"
|
||||||
|
echo "构建 [$service_name] -> $image_name:$VERSION"
|
||||||
|
echo "--------------------------------------"
|
||||||
|
|
||||||
|
if [ ! -d "$cache_file" ]; then
|
||||||
|
echo "警告: $service_name 的缓存不存在,跳过..."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 使用缓存进行离线构建
|
||||||
|
# --pull=false: 不尝试拉取镜像
|
||||||
|
# --network=none: 禁用网络访问
|
||||||
|
docker buildx build \
|
||||||
|
--cache-from "type=local,src=$cache_file" \
|
||||||
|
--pull=false \
|
||||||
|
--network=none \
|
||||||
|
-f "$dockerfile" \
|
||||||
|
-t "$image_name:$VERSION" \
|
||||||
|
--load \
|
||||||
|
. 2>&1 || {
|
||||||
|
echo "警告: $service_name 离线构建遇到问题,尝试仅使用缓存..."
|
||||||
|
docker buildx build \
|
||||||
|
--cache-from "type=local,src=$cache_file" \
|
||||||
|
--pull=false \
|
||||||
|
-f "$dockerfile" \
|
||||||
|
-t "$image_name:$VERSION" \
|
||||||
|
--load \
|
||||||
|
. 2>&1
|
||||||
|
}
|
||||||
|
|
||||||
|
echo "✓ $service_name 构建完成"
|
||||||
|
done
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "======================================"
|
||||||
|
echo "✓ 离线构建完成!"
|
||||||
|
echo "======================================"
|
||||||
|
echo ""
|
||||||
|
echo "构建的镜像列表:"
|
||||||
|
docker images | grep -E "(datamate-|deer-flow-)" || true
|
||||||
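For orientation, a typical run of build-offline.sh on the air-gapped host might look like the sketch below; the archive name and version tag are illustrative (the tarball follows the date-stamped pattern produced by export-cache.sh):

# Unpack the transferred cache bundle, then build with an explicit version tag (example values):
tar -xzf build-cache-20250101.tar.gz
./scripts/offline/build-offline.sh ./build-cache 1.0.0
# Spot-check the resulting images:
docker images | grep -E "(datamate-|deer-flow-)"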
151
scripts/offline/diagnose.sh
Normal file
@@ -0,0 +1,151 @@
#!/bin/bash
# 离线构建诊断脚本
# Usage: ./diagnose.sh [cache-dir]

set -e

CACHE_DIR="${1:-./build-cache}"

echo "======================================"
echo "离线构建环境诊断"
echo "======================================"
echo ""

# 1. 检查 Docker
echo "1. Docker 版本:"
docker version --format '{{.Server.Version}}' 2>/dev/null || echo "  无法获取版本"
echo ""

# 2. 检查 BuildKit
echo "2. BuildKit 状态:"
if docker buildx version > /dev/null 2>&1; then
    docker buildx version
    echo ""
    echo "可用的构建器:"
    docker buildx ls
else
    echo "  BuildKit 不可用"
fi
echo ""

# 3. 检查缓存目录
echo "3. 缓存目录检查 ($CACHE_DIR):"
if [ -d "$CACHE_DIR" ]; then
    echo "  ✓ 缓存目录存在"

    # 检查子目录
    for subdir in buildkit images resources; do
        if [ -d "$CACHE_DIR/$subdir" ]; then
            echo "  ✓ $subdir/ 存在"
            count=$(find "$CACHE_DIR/$subdir" -type d | wc -l)
            echo "    子目录数量: $count"
        else
            echo "  ✗ $subdir/ 不存在"
        fi
    done
else
    echo "  ✗ 缓存目录不存在"
fi
echo ""

# 4. 检查基础镜像
echo "4. 基础镜像检查:"
required_images=(
    "maven:3-eclipse-temurin-21"
    "maven:3-eclipse-temurin-8"
    "eclipse-temurin:21-jdk"
    "mysql:8"
    "node:20-alpine"
    "nginx:1.29"
    "ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm"
    "python:3.12-slim"
    "gcr.io/distroless/nodejs20-debian12"
)

missing_images=()
for img in "${required_images[@]}"; do
    if docker inspect "$img" > /dev/null 2>&1; then
        size=$(docker images --format "{{.Size}}" "$img" | head -1)
        echo "  ✓ $img ($size)"
    else
        echo "  ✗ $img (缺失)"
        missing_images+=("$img")
    fi
done
echo ""

# 5. 检查 BuildKit 缓存
echo "5. BuildKit 缓存检查:"
if [ -d "$CACHE_DIR/buildkit" ]; then
    for cache_dir in "$CACHE_DIR/buildkit"/*-cache; do
        if [ -d "$cache_dir" ]; then
            name=$(basename "$cache_dir")
            size=$(du -sh "$cache_dir" 2>/dev/null | cut -f1)
            echo "  ✓ $name ($size)"
        fi
    done
else
    echo "  ✗ 缓存目录不存在"
fi
echo ""

# 6. 检查资源文件
echo "6. 外部资源检查:"
if [ -d "$CACHE_DIR/resources" ]; then
    if [ -f "$CACHE_DIR/resources/models/ch_ppocr_mobile_v2.0_cls_infer.tar" ]; then
        size=$(du -sh "$CACHE_DIR/resources/models/ch_ppocr_mobile_v2.0_cls_infer.tar" | cut -f1)
        echo "  ✓ PaddleOCR 模型 ($size)"
    else
        echo "  ✗ PaddleOCR 模型缺失"
    fi

    if [ -f "$CACHE_DIR/resources/models/zh_core_web_sm-3.8.0-py3-none-any.whl" ]; then
        size=$(du -sh "$CACHE_DIR/resources/models/zh_core_web_sm-3.8.0-py3-none-any.whl" | cut -f1)
        echo "  ✓ spaCy 模型 ($size)"
    else
        echo "  ✗ spaCy 模型缺失"
    fi

    if [ -d "$CACHE_DIR/resources/DataX" ]; then
        echo "  ✓ DataX 源码"
    else
        echo "  ✗ DataX 源码缺失"
    fi

    if [ -d "$CACHE_DIR/resources/deer-flow" ]; then
        echo "  ✓ deer-flow 源码"
    else
        echo "  ✗ deer-flow 源码缺失"
    fi
else
    echo "  ✗ 资源目录不存在"
fi
echo ""

# 7. 网络检查
echo "7. 网络检查:"
if ping -c 1 8.8.8.8 > /dev/null 2>&1; then
    echo "  ⚠ 网络可用(离线构建环境通常不需要)"
else
    echo "  ✓ 网络不可达(符合离线环境)"
fi
echo ""

# 8. 总结
echo "======================================"
echo "诊断总结"
echo "======================================"

if [ ${#missing_images[@]} -eq 0 ]; then
    echo "✓ 所有基础镜像已就绪"
else
    echo "✗ 缺少 ${#missing_images[@]} 个基础镜像:"
    printf '  - %s\n' "${missing_images[@]}"
    echo ""
    echo "修复方法:"
    if [ -f "$CACHE_DIR/images/base-images.tar" ]; then
        echo "  docker load -i $CACHE_DIR/images/base-images.tar"
    else
        echo "  请确保有网环境导出时包含所有基础镜像"
    fi
fi
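The diagnostic script only reads state, so it can be run before every offline build attempt; a usage sketch (the alternate cache path is an example, not a path from the repository):

# Check the default ./build-cache layout:
./scripts/offline/diagnose.sh
# Or point it at a cache unpacked somewhere else:
./scripts/offline/diagnose.sh /data/offline/build-cache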
172
scripts/offline/export-cache.sh
Normal file
@@ -0,0 +1,172 @@
#!/bin/bash
# BuildKit 缓存导出脚本 - 在有网环境执行
# Usage: ./export-cache.sh [output-dir]

set -e

OUTPUT_DIR="${1:-./build-cache}"
BUILDKIT_CACHE_DIR="$OUTPUT_DIR/buildkit"
IMAGES_DIR="$OUTPUT_DIR/images"
RESOURCES_DIR="$OUTPUT_DIR/resources"
APT_CACHE_DIR="$OUTPUT_DIR/apt-cache"

# 确保 buildx 构建器存在
if ! docker buildx inspect offline-builder > /dev/null 2>&1; then
    echo "创建 buildx 构建器..."
    docker buildx create --name offline-builder --driver docker-container --use
else
    docker buildx use offline-builder
fi

mkdir -p "$BUILDKIT_CACHE_DIR" "$IMAGES_DIR" "$RESOURCES_DIR" "$APT_CACHE_DIR"

echo "======================================"
echo "1. 导出基础镜像"
echo "======================================"

BASE_IMAGES=(
    "maven:3-eclipse-temurin-21"
    "maven:3-eclipse-temurin-8"
    "eclipse-temurin:21-jdk"
    "mysql:8"
    "node:20-alpine"
    "nginx:1.29"
    "ghcr.nju.edu.cn/astral-sh/uv:python3.11-bookworm"
    "ghcr.nju.edu.cn/astral-sh/uv:python3.12-bookworm"
    "ghcr.nju.edu.cn/astral-sh/uv:latest"
    "python:3.12-slim"
    "python:3.11-slim"
    "gcr.nju.edu.cn/distroless/nodejs20-debian12"
)

for img in "${BASE_IMAGES[@]}"; do
    echo "拉取: $img"
    docker pull "$img" || echo "警告: $img 拉取失败,可能已存在"
done

echo ""
echo "保存基础镜像到 $IMAGES_DIR/base-images.tar..."
docker save -o "$IMAGES_DIR/base-images.tar" "${BASE_IMAGES[@]}"
echo "✓ 基础镜像保存完成"

echo ""
echo "======================================"
echo "2. 导出 BuildKit 构建缓存"
echo "======================================"

# 定义服务配置
SERVICES=(
    "backend:datamate-backend:scripts/images/backend/Dockerfile"
    "backend-python:datamate-backend-python:scripts/images/backend-python/Dockerfile"
    "database:datamate-database:scripts/images/database/Dockerfile"
    "frontend:datamate-frontend:scripts/images/frontend/Dockerfile"
    "gateway:datamate-gateway:scripts/images/gateway/Dockerfile"
    "runtime:datamate-runtime:scripts/images/runtime/Dockerfile"
    "deer-flow-backend:deer-flow-backend:scripts/images/deer-flow-backend/Dockerfile"
    "deer-flow-frontend:deer-flow-frontend:scripts/images/deer-flow-frontend/Dockerfile"
    "mineru:datamate-mineru:scripts/images/mineru/Dockerfile"
)

for service_config in "${SERVICES[@]}"; do
    IFS=':' read -r service_name image_name dockerfile <<< "$service_config"
    cache_file="$BUILDKIT_CACHE_DIR/$service_name-cache"

    echo ""
    echo "导出 [$service_name] 缓存到 $cache_file..."

    # 先正常构建以填充缓存
    docker buildx build \
        --cache-to "type=local,dest=$cache_file,mode=max" \
        -f "$dockerfile" \
        -t "$image_name:cache" \
        . || echo "警告: $service_name 缓存导出失败"

    echo "✓ $service_name 缓存导出完成"
done

echo ""
echo "======================================"
echo "3. 预下载外部资源"
echo "======================================"

# PaddleOCR 模型
mkdir -p "$RESOURCES_DIR/models"
if [ ! -f "$RESOURCES_DIR/models/ch_ppocr_mobile_v2.0_cls_infer.tar" ]; then
    echo "下载 PaddleOCR 模型..."
    wget -O "$RESOURCES_DIR/models/ch_ppocr_mobile_v2.0_cls_infer.tar" \
        "https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar" || true
fi

# spaCy 模型
if [ ! -f "$RESOURCES_DIR/models/zh_core_web_sm-3.8.0-py3-none-any.whl" ]; then
    echo "下载 spaCy 模型..."
    wget -O "$RESOURCES_DIR/models/zh_core_web_sm-3.8.0-py3-none-any.whl" \
        "https://ghproxy.net/https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl" || true
fi

# DataX 源码
if [ ! -d "$RESOURCES_DIR/DataX" ]; then
    echo "克隆 DataX 源码..."
    git clone --depth 1 "https://gitee.com/alibaba/DataX.git" "$RESOURCES_DIR/DataX" || true
fi

# deer-flow 源码(用于 deer-flow 构建)
if [ ! -d "$RESOURCES_DIR/deer-flow" ]; then
    echo "克隆 deer-flow 源码..."
    git clone --depth 1 "https://ghproxy.net/https://github.com/ModelEngine-Group/deer-flow.git" "$RESOURCES_DIR/deer-flow" || true
fi

echo ""
echo "======================================"
echo "4. 导出 APT 缓存"
echo "======================================"

# 为需要 apt 的镜像预生成 apt 缓存
echo "生成 APT list 缓存..."

# eclipse-temurin:21-jdk 的 apt 缓存
docker run --rm \
    -v "$APT_CACHE_DIR/eclipse-temurin:/var/cache/apt/archives" \
    -v "$APT_CACHE_DIR/eclipse-temurin-lists:/var/lib/apt/lists" \
    eclipse-temurin:21-jdk \
    bash -c "apt-get update && apt-get install -y --download-only vim wget curl rsync python3 python3-pip python-is-python3 dos2unix libreoffice fonts-noto-cjk 2>/dev/null || true" 2>/dev/null || echo " Warning: eclipse-temurin apt 缓存导出失败"

# python:3.12-slim 的 apt 缓存
docker run --rm \
    -v "$APT_CACHE_DIR/python312:/var/cache/apt/archives" \
    -v "$APT_CACHE_DIR/python312-lists:/var/lib/apt/lists" \
    python:3.12-slim \
    bash -c "apt-get update && apt-get install -y --download-only vim openjdk-21-jre nfs-common glusterfs-client rsync 2>/dev/null || true" 2>/dev/null || echo " Warning: python3.12 apt 缓存导出失败"

# python:3.11-slim 的 apt 缓存
docker run --rm \
    -v "$APT_CACHE_DIR/python311:/var/cache/apt/archives" \
    -v "$APT_CACHE_DIR/python311-lists:/var/lib/apt/lists" \
    python:3.11-slim \
    bash -c "apt-get update && apt-get install -y --download-only curl vim libgl1 libglx0 libopengl0 libglib2.0-0 procps 2>/dev/null || true" 2>/dev/null || echo " Warning: python3.11 apt 缓存导出失败"

echo "✓ APT 缓存导出完成"

echo ""
echo "======================================"
echo "5. 打包缓存"
echo "======================================"

cd "$OUTPUT_DIR"
tar -czf "build-cache-$(date +%Y%m%d).tar.gz" buildkit images resources apt-cache
cd - > /dev/null

echo ""
echo "======================================"
echo "✓ 缓存导出完成!"
echo "======================================"
echo "缓存位置: $OUTPUT_DIR"
echo "传输文件: $OUTPUT_DIR/build-cache-$(date +%Y%m%d).tar.gz"
echo ""
echo "包含内容:"
echo " - 基础镜像 (images/)"
echo " - BuildKit 缓存 (buildkit/)"
echo " - 外部资源 (resources/)"
echo " - APT 缓存 (apt-cache/)"
echo ""
echo "请将此压缩包传输到无网环境后解压使用"
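On the connected machine the export is a single invocation; a sketch of the end-to-end handoff, where the scp destination host and path are purely illustrative:

# Export images, BuildKit caches, external resources and APT packages (output dir defaults to ./build-cache):
./scripts/offline/export-cache.sh ./build-cache
# Ship the dated bundle to the offline host (destination is an example):
scp ./build-cache/build-cache-$(date +%Y%m%d).tar.gz user@offline-host:/data/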