feature: add mysql collection and starrocks collection (#222)

* fix: fix the path for backend-python image building
* feature: add mysql collection and starrocks collection
* fix: change the permission of files collected from NFS to 754
* fix: delete collected files, config files, and log files when a collection task is deleted
* fix: add the collection task detail API
* fix: adjust the dataset collection logging
* fix: add collection task selection when creating and updating a dataset
* fix: set the umask value to 0022 for the Java process

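One of the fixes above changes the permission of files collected from NFS to 754. That change is not part of the hunks shown below; as a rough sketch, assuming a pathlib walk over a hypothetical target directory, it could look like this:

    import os
    from pathlib import Path

    def set_collected_permissions(target_dir: str) -> None:
        # Apply rwxr-xr-- (0o754) to every regular file under the collection target.
        for path in Path(target_dir).rglob("*"):
            if path.is_file():
                os.chmod(path, 0o754)
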
@@ -13,9 +13,10 @@ from app.module.shared.schema import TaskStatus
logger = get_logger(__name__)

class DataxClient:
-    def __init__(self, task: CollectionTask, execution: TaskExecution):
+    def __init__(self, task: CollectionTask, execution: TaskExecution, template: CollectionTemplate):
        self.execution = execution
        self.task = task
+        self.template = template
        self.config_file_path = f"/flow/data-collection/{task.id}/config.json"
        self.python_path = "python"
        self.datax_main = "/opt/datax/bin/datax.py"

@@ -53,10 +54,21 @@ class DataxClient:
            **(task_config.parameter if task_config.parameter else {}),
            **(task_config.reader if task_config.reader else {})
        }
+        dest_parameter = {}
+        if template.target_type == "txtfilewriter":
+            dest_parameter = {
+                "path": target_path,
+                "fileName": "collection_result",
+                "writeMode": "truncate"
+            }
+        elif template.target_type == "nfswriter" or template.target_type == "obswriter":
+            dest_parameter = {
+                "destPath": target_path
+            }
        writer_parameter = {
            **(task_config.parameter if task_config.parameter else {}),
            **(task_config.writer if task_config.writer else {}),
-            "destPath": target_path
+            **dest_parameter
        }
        # Generate the job run configuration
        job_config = {

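For orientation, the job_config assembled here follows DataX's standard job layout (a setting block plus a content list with one reader and one writer). A minimal sketch of a MySQL-to-text-file collection job, with illustrative values that are not taken from this repository:

    job_config = {
        "job": {
            "setting": {"speed": {"channel": 1}},
            "content": [{
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "username": "datamate",        # illustrative credentials
                        "password": "******",
                        "column": ["*"],
                        "connection": [{
                            "jdbcUrl": ["jdbc:mysql://db-host:3306/source_db"],
                            "table": ["orders"]
                        }]
                    }
                },
                "writer": {
                    "name": "txtfilewriter",
                    "parameter": {
                        "path": "/dataset/local/<task_id>",
                        "fileName": "collection_result",
                        "writeMode": "truncate"
                    }
                }
            }]
        }
    }
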
@@ -128,6 +140,7 @@ class DataxClient:
            logger.info(f"DataX task completed successfully: {self.execution.id}")
            logger.info(f"Execution took {self.execution.duration_seconds:.2f} seconds")
            self.execution.status = TaskStatus.COMPLETED.name
+            self.rename_collection_result()
        else:
            self.execution.error_message = self.execution.error_message or f"DataX task failed, exit code: {exit_code}"
            self.execution.status = TaskStatus.FAILED.name

@@ -141,6 +154,23 @@ class DataxClient:
        if self.task.sync_mode == SyncMode.ONCE:
            self.task.status = self.execution.status

+    def rename_collection_result(self):
+        if self.template.target_type != "txtfilewriter":
+            return
+        target_path = Path(self.task.target_path)
+        if not target_path.exists():
+            logger.warning(f"Target path does not exist: {target_path}")
+            return
+        # If it's a directory, find all files without extensions
+        for file_path in target_path.iterdir():
+            if file_path.is_file() and not file_path.suffix:
+                new_path = file_path.with_suffix('.csv')
+                try:
+                    file_path.rename(new_path)
+                    logger.info(f"Renamed {file_path} to {new_path}")
+                except Exception as e:
+                    logger.error(f"Failed to rename {file_path} to {new_path}: {str(e)}")
+
    def _run_process(self, cmd: list[str], log_f) -> int:
        # Start the process
        process = subprocess.Popen(

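The commit message also mentions setting the umask to 0022 for the Java process. That change is not visible in these hunks; one common way to force a umask on a child process from Python, shown here only as a sketch with a hypothetical helper, is via preexec_fn:

    import os
    import subprocess

    def run_with_umask(cmd: list[str]) -> int:
        # preexec_fn runs in the child before exec (POSIX only), so the umask
        # applies to the spawned process, e.g. a java or datax.py invocation.
        process = subprocess.Popen(cmd, preexec_fn=lambda: os.umask(0o022))
        return process.wait()
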
@@ -1,5 +1,7 @@
import math
import uuid
+import shutil
+import os
from typing import Optional

from fastapi import APIRouter, Depends, HTTPException, Query

@@ -138,6 +140,13 @@ async def delete_collection_tasks(
            .where(TaskExecution.task_id == task_id)
        )

+        target_path = f"/dataset/local/{task_id}"
+        if os.path.exists(target_path):
+            shutil.rmtree(target_path)
+        job_path = f"/flow/data-collection/{task_id}"
+        if os.path.exists(job_path):
+            shutil.rmtree(job_path)
+
        # Delete the task
        await db.delete(task)
        await db.commit()

@@ -155,3 +164,29 @@ async def delete_collection_tasks(
        await db.rollback()
        logger.error(f"Failed to delete collection task: {str(e)}")
        raise HTTPException(status_code=500, detail="Internal server error")
+
+@router.get("/{task_id}", response_model=StandardResponse[CollectionTaskBase])
+async def get_task(
+    task_id: str,
+    db: AsyncSession = Depends(get_db)
+):
+    """Get collection task details."""
+    try:
+        # Query the task by ID
+        task = await db.get(CollectionTask, task_id)
+        if not task:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Task with ID {task_id} not found"
+            )
+
+        return StandardResponse(
+            code=200,
+            message="Success",
+            data=converter_to_response(task)
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Failed to get task {task_id}: {str(e)}", e)
+        raise HTTPException(status_code=500, detail="Internal server error")

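A quick way to exercise the new task detail endpoint from a script; the host and URL prefix below are assumptions, since the router's mount point is not shown in this diff:

    import requests

    # Hypothetical base URL and prefix; adjust to wherever the router is mounted.
    resp = requests.get("http://localhost:8000/api/collection-tasks/<task_id>")
    resp.raise_for_status()
    body = resp.json()
    print(body["code"], body["message"], body["data"])
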
@@ -58,6 +58,7 @@ class CollectionTaskService:
            logger.error(f"task {task_id} not exist")
            return
        template = await session.execute(select(CollectionTemplate).where(CollectionTemplate.id == task.template_id))
        template = template.scalar_one_or_none()
        if not template:
            logger.error(f"template {task.template_name} not exist")
            return

@@ -65,6 +66,6 @@ class CollectionTaskService:
        session.add(task_execution)
        await session.commit()
        await asyncio.to_thread(
-            DataxClient(execution=task_execution, task=task).run_datax_job
+            DataxClient(execution=task_execution, task=task, template=template).run_datax_job
        )
        await session.commit()

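For context, asyncio.to_thread runs the blocking DataX invocation in a worker thread so the event loop stays responsive while the job runs. A minimal, standalone sketch of the pattern, with illustrative names not taken from the repository:

    import asyncio
    import time

    def blocking_job() -> str:
        # Stand-in for the synchronous DataX subprocess call.
        time.sleep(1)
        return "done"

    async def main() -> None:
        # Offload the blocking call to a thread; the event loop keeps running.
        result = await asyncio.to_thread(blocking_job)
        print(result)

    asyncio.run(main())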