feature: add mysql collection and starrocks collection (#222)

* fix: fix the path for backend-python image building

* feature: add mysql collection and starrocks collection

* feature: add mysql collection and starrocks collection

* fix: change the permission of files collected from NFS to 754

* fix: delete collected files, config files and log files while deleting collection task

* fix: add the collection task detail api

* fix: change the log of collecting for dataset

* fix: add collection task selecting while creating and updating dataset

* fix: set the umask value to 0022 for java process
This commit is contained in:
hefanli
2026-01-04 19:05:08 +08:00
committed by GitHub
parent 8d61eb28c3
commit ccfb84c034
13 changed files with 208 additions and 115 deletions

View File

@@ -13,9 +13,10 @@ from app.module.shared.schema import TaskStatus
logger = get_logger(__name__)
class DataxClient:
def __init__(self, task: CollectionTask, execution: TaskExecution):
def __init__(self, task: CollectionTask, execution: TaskExecution, template: CollectionTemplate):
self.execution = execution
self.task = task
self.template = template
self.config_file_path = f"/flow/data-collection/{task.id}/config.json"
self.python_path = "python"
self.datax_main = "/opt/datax/bin/datax.py"
@@ -53,10 +54,21 @@ class DataxClient:
**(task_config.parameter if task_config.parameter else {}),
**(task_config.reader if task_config.reader else {})
}
dest_parameter = {}
if template.target_type == "txtfilewriter":
dest_parameter = {
"path": target_path,
"fileName": "collection_result",
"writeMode": "truncate"
}
elif template.target_type == "nfswriter" or template.target_type == "obswriter":
dest_parameter = {
"destPath": target_path
}
writer_parameter = {
**(task_config.parameter if task_config.parameter else {}),
**(task_config.writer if task_config.writer else {}),
"destPath": target_path
**dest_parameter
}
# 生成任务运行配置
job_config = {
@@ -128,6 +140,7 @@ class DataxClient:
logger.info(f"DataX 任务执行成功: {self.execution.id}")
logger.info(f"执行耗时: {self.execution.duration_seconds:.2f}")
self.execution.status = TaskStatus.COMPLETED.name
self.rename_collection_result()
else:
self.execution.error_message = self.execution.error_message or f"DataX 任务执行失败,退出码: {exit_code}"
self.execution.status = TaskStatus.FAILED.name
@@ -141,6 +154,23 @@ class DataxClient:
if self.task.sync_mode == SyncMode.ONCE:
self.task.status = self.execution.status
def rename_collection_result(self):
    """Give collected result files a ``.csv`` extension.

    Only applies when the template's target type is ``txtfilewriter``,
    whose output files are written without an extension; all other
    writer types are left untouched.
    """
    if self.template.target_type != "txtfilewriter":
        return
    target_path = Path(self.task.target_path)
    if not target_path.exists():
        logger.warning(f"Target path does not exist: {target_path}")
        return
    # BUG FIX: iterdir() raises NotADirectoryError when target_path is a
    # plain file; guard explicitly instead of crashing the whole task.
    if not target_path.is_dir():
        logger.warning(f"Target path is not a directory: {target_path}")
        return
    # Rename every extension-less regular file to <name>.csv.
    for file_path in target_path.iterdir():
        if file_path.is_file() and not file_path.suffix:
            new_path = file_path.with_suffix('.csv')
            try:
                file_path.rename(new_path)
                logger.info(f"Renamed {file_path} to {new_path}")
            except OSError as e:
                # rename failures surface as OSError; log and continue so
                # one bad file does not abort the remaining renames.
                logger.error(f"Failed to rename {file_path} to {new_path}: {str(e)}")
def _run_process(self, cmd: list[str], log_f) -> int:
# 启动进程
process = subprocess.Popen(

View File

@@ -1,5 +1,7 @@
import math
import uuid
import shutil
import os
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
@@ -138,6 +140,13 @@ async def delete_collection_tasks(
.where(TaskExecution.task_id == task_id)
)
target_path = f"/dataset/local/{task_id}"
if os.path.exists(target_path):
shutil.rmtree(target_path)
job_path = f"/flow/data-collection/{task_id}"
if os.path.exists(job_path):
shutil.rmtree(job_path)
# 删除任务
await db.delete(task)
await db.commit()
@@ -155,3 +164,29 @@ async def delete_collection_tasks(
await db.rollback()
logger.error(f"Failed to delete collection task: {str(e)}")
raise HTTPException(status_code=500, detail="Internal server error")
@router.get("/{task_id}", response_model=StandardResponse[CollectionTaskBase])
async def get_task(
    task_id: str,
    db: AsyncSession = Depends(get_db)
):
    """Return the detail of a single collection task.

    Args:
        task_id: Primary key of the collection task.
        db: Injected async database session.

    Raises:
        HTTPException: 404 when the task does not exist, 500 on any
            unexpected error.
    """
    try:
        # Primary-key lookup; returns None when no row matches.
        task = await db.get(CollectionTask, task_id)
        if not task:
            raise HTTPException(
                status_code=404,
                detail=f"Task with ID {task_id} not found"
            )
        return StandardResponse(
            code=200,
            message="Success",
            data=converter_to_response(task)
        )
    except HTTPException:
        # Re-raise deliberate HTTP errors (e.g. the 404 above) unchanged.
        raise
    except Exception as e:
        # BUG FIX: the original `logger.error(f"...", e)` passed `e` as a
        # %-style formatting argument with no placeholder in the message,
        # which raises a logging formatting error. logger.exception logs
        # the message together with the full traceback.
        logger.exception(f"Failed to get task {task_id}: {str(e)}")
        raise HTTPException(status_code=500, detail="Internal server error")

View File

@@ -58,6 +58,7 @@ class CollectionTaskService:
logger.error(f"task {task_id} not exist")
return
template = await session.execute(select(CollectionTemplate).where(CollectionTemplate.id == task.template_id))
template = template.scalar_one_or_none()
if not template:
logger.error(f"template {task.template_name} not exist")
return
@@ -65,6 +66,6 @@ class CollectionTaskService:
session.add(task_execution)
await session.commit()
await asyncio.to_thread(
DataxClient(execution=task_execution, task=task).run_datax_job
DataxClient(execution=task_execution, task=task, template=template).run_datax_job
)
await session.commit()