feature: 更新算子名称;增加创建任务、模板校验 (#57)

* feature: 更新算子名称;增加创建任务、模板校验

* feature: 镜像构建增加缓存
This commit is contained in:
hhhhsc701
2025-11-05 17:38:03 +08:00
committed by GitHub
parent 6917ae5b30
commit 05b26a2981
16 changed files with 56 additions and 40 deletions

View File

@@ -30,11 +30,13 @@ public class CleanTaskValidator {
for (int i = 1; i < operators.size(); i++) { for (int i = 1; i < operators.size(); i++) {
OperatorInstanceDto front = operators.get(i - 1); OperatorInstanceDto front = operators.get(i - 1);
OperatorInstanceDto back = operators.get(i); OperatorInstanceDto back = operators.get(i);
if (!StringUtils.equals(front.getOutputs(), back.getInputs())) { if (StringUtils.equals(front.getOutputs(), back.getInputs()) || StringUtils.equalsAny("multimodal",
front.getOutputs(), back.getOutputs())) {
continue;
}
throw BusinessException.of(CleanErrorCode.IN_AND_OUT_NOT_MATCH, throw BusinessException.of(CleanErrorCode.IN_AND_OUT_NOT_MATCH,
String.format(Locale.ROOT, "ops(name: [%s, %s]) inputs and outputs does not match", String.format(Locale.ROOT, "ops(name: [%s, %s]) inputs and outputs does not match",
front.getName(), back.getName())); front.getName(), back.getName()));
} }
} }
} }
}

View File

@@ -39,7 +39,7 @@ spec:
{{- toYaml . | nindent 12 }} {{- toYaml . | nindent 12 }}
{{- end }} {{- end }}
image: "{{ include "backend.image" . }}" image: "{{ include "backend.image" . }}"
imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }} imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }}
ports: ports:
- name: http - name: http
containerPort: {{ .Values.service.port }} containerPort: {{ .Values.service.port }}

View File

@@ -67,9 +67,5 @@ Name of image
{{- define "database.image" -}} {{- define "database.image" -}}
{{- $name := default .Values.image.repository .Values.global.image.database.name }} {{- $name := default .Values.image.repository .Values.global.image.database.name }}
{{- $tag := default .Values.image.tag .Values.global.image.database.tag }} {{- $tag := default .Values.image.tag .Values.global.image.database.tag }}
{{- if .Values.global.image.repository }}
{{- .Values.global.image.repository | trimSuffix "/" }}/{{ $name }}:{{ $tag }}
{{- else }}
{{- $name }}:{{ $tag }} {{- $name }}:{{ $tag }}
{{- end }} {{- end }}
{{- end }}

View File

@@ -61,7 +61,7 @@ spec:
{{- toYaml . | nindent 12 }} {{- toYaml . | nindent 12 }}
{{- end }} {{- end }}
image: "{{ include "database.image" . }}" image: "{{ include "database.image" . }}"
imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }} imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }}
ports: ports:
- name: http - name: http
containerPort: {{ .Values.service.port }} containerPort: {{ .Values.service.port }}

View File

@@ -40,7 +40,7 @@ spec:
{{- toYaml . | nindent 12 }} {{- toYaml . | nindent 12 }}
{{- end }} {{- end }}
image: "{{ include "frontend.image" . }}" image: "{{ include "frontend.image" . }}"
imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }} imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }}
ports: ports:
- name: http - name: http
containerPort: {{ .Values.service.port }} containerPort: {{ .Values.service.port }}

View File

@@ -3,6 +3,8 @@
# Declare variables to be passed into your templates. # Declare variables to be passed into your templates.
global: global:
deerFlow:
enable: false
image: image:
repository: "" repository: ""
pullPolicy: "IfNotPresent" pullPolicy: "IfNotPresent"

View File

@@ -39,7 +39,7 @@ spec:
{{- toYaml . | nindent 12 }} {{- toYaml . | nindent 12 }}
{{- end }} {{- end }}
image: "{{ include "backend.image" . }}" image: "{{ include "backend.image" . }}"
imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }} imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }}
ports: ports:
- name: http - name: http
containerPort: {{ .Values.service.port }} containerPort: {{ .Values.service.port }}

View File

@@ -39,7 +39,7 @@ spec:
{{- toYaml . | nindent 12 }} {{- toYaml . | nindent 12 }}
{{- end }} {{- end }}
image: "{{ include "frontend.image" . }}" image: "{{ include "frontend.image" . }}"
imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }} imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }}
ports: ports:
- name: http - name: http
containerPort: {{ .Values.service.port }} containerPort: {{ .Values.service.port }}

View File

@@ -21,7 +21,7 @@ def _import_operators():
from . import file_exporter from . import file_exporter
from . import slide_formatter from . import slide_formatter
from . import unstructured_formatter from . import unstructured_formatter
from . import external_pdf_formatter from . import mineru_formatter
_import_operators() _import_operators()

View File

@@ -2,5 +2,5 @@
from datamate.core.base_op import OPERATORS from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ExternalPDFFormatter', OPERATORS.register_module(module_name='MineruFormatter',
module_path="ops.formatter.external_pdf_formatter.process") module_path="ops.formatter.external_pdf_formatter.process")

View File

@@ -1,10 +1,10 @@
name: 'MinerU PDF文本抽取' name: 'MinerU PDF文本抽取'
name_en: 'External PDF Text Extraction' name_en: 'MinerU PDF Text Extraction'
description: '基于MinerU API,抽取PDF中的文本。' description: '基于MinerU API,抽取PDF中的文本。'
description_en: 'Extracts text from PDF files based on MinerU API.' description_en: 'Extracts text from PDF files based on MinerU API.'
language: 'python' language: 'python'
vendor: 'huawei' vendor: 'huawei'
raw_id: 'ExternalPDFFormatter' raw_id: 'MineruFormatter'
version: '1.0.0' version: '1.0.0'
types: types:
- 'collect' - 'collect'

View File

@@ -15,11 +15,11 @@ from datamate.core.base_op import Mapper
from datamate.common.utils.rest_client import http_request from datamate.common.utils.rest_client import http_request
class ExternalPDFFormatter(Mapper): class MineruFormatter(Mapper):
"""基于外部API,抽取PDF中的文本""" """基于外部API,抽取PDF中的文本"""
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super(ExternalPDFFormatter, self).__init__(*args, **kwargs) super(MineruFormatter, self).__init__(*args, **kwargs)
self.base_url = os.getenv("EXTERNAL_PDF_BASE_URL", "http://datamate-mineru:9001") self.base_url = os.getenv("EXTERNAL_PDF_BASE_URL", "http://datamate-mineru:9001")
self.pdf_extract_url = f"{self.base_url}/api/pdf-extract" self.pdf_extract_url = f"{self.base_url}/api/pdf-extract"
@@ -31,8 +31,8 @@ class ExternalPDFFormatter(Mapper):
response = http_request(method="POST", url=self.pdf_extract_url, data=data) response = http_request(method="POST", url=self.pdf_extract_url, data=data)
sample[self.text_key] = json.loads(response.text).get("result") sample[self.text_key] = json.loads(response.text).get("result")
logger.info( logger.info(
f"fileName: {filename}, method: ExternalPDFFormatter costs {(time.time() - start):6f} s") f"fileName: {filename}, method: MineruFormatter costs {(time.time() - start):6f} s")
except UnicodeDecodeError as err: except UnicodeDecodeError as err:
logger.exception(f"fileName: {filename}, method: ExternalPDFFormatter causes decode error: {err}") logger.exception(f"fileName: {filename}, method: MineruFormatter causes decode error: {err}")
raise raise
return sample return sample

View File

@@ -1,4 +1,4 @@
name: '非结构化文本抽取' name: 'Unstructured文本抽取'
name_en: 'Unstructured Text Extraction' name_en: 'Unstructured Text Extraction'
description: '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。' description: '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。'
description_en: 'Extracts text from Unstructured files, currently supporting PowerPoint presentations, Word documents and Excel spreadsheets files.' description_en: 'Extracts text from Unstructured files, currently supporting PowerPoint presentations, Word documents and Excel spreadsheets files.'

View File

@@ -51,7 +51,7 @@ class LazyLoader(ModuleType):
def __init__(self, def __init__(self,
package_name, package_name,
module_name=None, module_name=None,
whl_path="/dataset/ops_whl", whl_path=None,
exact_version=None, exact_version=None,
force_reinstall=False force_reinstall=False
): ):
@@ -72,7 +72,7 @@ class LazyLoader(ModuleType):
self._module_name = module_name if module_name else package_name self._module_name = module_name if module_name else package_name
self._package_name = package_name self._package_name = package_name
self.whl_path = Path(whl_path).resolve() self.whl_path = whl_path
self.exact_version = exact_version self.exact_version = exact_version
self.force_reinstall = force_reinstall self.force_reinstall = force_reinstall
@@ -126,7 +126,10 @@ class LazyLoader(ModuleType):
need_install = True need_install = True
if need_install: if need_install:
self._pip_install_package(package_name) if self.whl_path is None:
self._pip_install_package_pypi(package_name)
else:
self._pip_install_package_local(package_name)
module = importlib.import_module(module_name) module = importlib.import_module(module_name)
self._cached_module = module self._cached_module = module
self._register_alias(module) self._register_alias(module)
@@ -168,13 +171,26 @@ class LazyLoader(ModuleType):
return line.split()[-1] return line.split()[-1]
raise PackageNotFoundError() raise PackageNotFoundError()
def _pip_install_package(self, package_name: str): def _pip_install_package_pypi(self, package_name: str):
if self.exact_version:
package_name += f"=={self.exact_version}"
try:
subprocess.check_call([
sys.executable, "-m", "pip", "install", str(package_name)
], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
logger.info(f"Successfully installed {package_name}")
except subprocess.CalledProcessError as e:
logger.error(f"Installation failed: {e}")
raise RuntimeError(f"Installation failed: {e}") from e
def _pip_install_package_local(self, package_name: str):
"""安装逻辑 """ """安装逻辑 """
if not self.whl_path.exists(): whl_path = Path(self.whl_path).resolve()
if not whl_path.exists():
raise FileNotFoundError(f"WHL directory not found: {self.whl_path}") raise FileNotFoundError(f"WHL directory not found: {self.whl_path}")
whl_files = list(self.whl_path.glob(f"{package_name}*.whl")) whl_files = list(whl_path.glob(f"{package_name}*.whl"))
if not whl_files: if not whl_files:
raise RuntimeError(f"No WHL files found for {package_name}") raise RuntimeError(f"No WHL files found for {package_name}")

View File

@@ -70,8 +70,8 @@ VALUES ('64465bec-b46b-11f0-8291-00155d0e4808', '模态', 'modal', 'predefined'
INSERT IGNORE INTO t_operator INSERT IGNORE INTO t_operator
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star) (id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false), VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。', '1.0.0', 'text', 'text', null, null, '', false), ('UnstructuredFormatter', 'Unstructured文本抽取', '基于Unstructured抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。', '1.0.0', 'text', 'text', null, null, '', false),
('ExternalPDFFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false), ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false), ('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'), ('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'), ('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),

View File

@@ -1,4 +1,4 @@
FROM python:3.11 FROM ghcr.io/astral-sh/uv:python3.11-bookworm
COPY runtime/python-executor /opt/runtime COPY runtime/python-executor /opt/runtime
COPY runtime/ops /opt/runtime/datamate/ops COPY runtime/ops /opt/runtime/datamate/ops
@@ -7,16 +7,16 @@ COPY scripts/images/runtime/start.sh /opt/runtime/start.sh
ENV PYTHONPATH=/opt/runtime/datamate/ ENV PYTHONPATH=/opt/runtime/datamate/
RUN apt update \ RUN --mount=type=cache,target=/var/cache/apt \
&& apt install -y libgl1 libglib2.0-0 vim libmagic1t64 libreoffice dos2unix \ --mount=type=cache,target=/var/lib/apt \
&& apt clean \ apt update \
&& rm -rf /var/lib/apt/lists/* && apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix
WORKDIR /opt/runtime WORKDIR /opt/runtime
RUN pip install -e . --trusted-host mirrors.huaweicloud.com -i https://mirrors.huaweicloud.com/repository/pypi/simple \ RUN --mount=type=cache,target=/root/.cache/uv \
&& pip install -r /opt/runtime/datamate/ops/requirements.txt --trusted-host mirrors.huaweicloud.com -i https://mirrors.huaweicloud.com/repository/pypi/simple \ uv pip install -e . --system \
&& pip cache purge && uv pip install -r /opt/runtime/datamate/ops/requirements.txt --system
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \ RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
&& chmod +x /opt/runtime/start.sh \ && chmod +x /opt/runtime/start.sh \