init datamate
runtime/python-executor/datamate/sql_manager/__init__.py (new normal file, 2 lines added)
@@ -0,0 +1,2 @@
@@ -0,0 +1,176 @@
# -*- coding: utf-8 -*-

import json
import time
import os
import uuid
from datetime import datetime
from pathlib import Path
from typing import Dict, Any

from loguru import logger
from sqlalchemy import text

from datamate.sql_manager.sql_manager import SQLManager


class TaskInfoPersistence:
    def __init__(self):
        self.sql_dict = self.load_sql_dict()

    @staticmethod
    def load_sql_dict():
        """Load the SQL statements from sql/sql_config.json."""
        sql_config_path = str(Path(__file__).parent / 'sql' / 'sql_config.json')
        with open(sql_config_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def persistence_task_info(self, sample: Dict[str, Any]):
        instance_id = str(sample.get("instance_id"))
        src_file_name = str(sample.get("sourceFileName"))
        src_file_type = str(sample.get("sourceFileType"))
        src_file_id = str(sample.get("sourceFileId"))
        src_file_size = int(sample.get("sourceFileSize"))
        file_id = str(uuid.uuid4())
        file_size = str(sample.get("fileSize"))
        file_type = str(sample.get("fileType"))
        file_name = str(sample.get("fileName"))

        status = str(sample.get("execute_status"))
        failed_reason = str(sample.get("failed_reason"))
        result_data = {
            "instance_id": instance_id,
            "src_file_id": src_file_id,
            "dest_file_id": file_id,
            "src_name": src_file_name,
            "dest_name": file_name,
            "src_type": src_file_type,
            "dest_type": file_type,
            "src_size": src_file_size,
            "dest_size": file_size,
            "status": status,
            "result": failed_reason
        }
        self.insert_result(result_data, str(self.sql_dict.get("insert_clean_result_sql")))

        dataset_id = str(sample.get("dataset_id"))
        file_path = str(sample.get("filePath"))
        create_time = datetime.now()
        last_access_time = datetime.fromtimestamp(os.path.getmtime(file_path))
        file_data = {
            "id": file_id,
            "dataset_id": dataset_id,
            "file_name": file_name,
            "file_path": file_path,
            "file_type": file_type,
            "file_size": file_size,
            "status": "COMPLETED",
            "upload_time": create_time,
            "last_access_time": last_access_time,
            "created_at": create_time,
            "updated_at": create_time
        }
        self.insert_result(file_data, str(self.sql_dict.get("insert_dataset_file_sql")))

    @staticmethod
    def insert_result(data, sql):
        # Retry on lock errors; any other database error is fatal.
        retries = 0
        max_retries = 20
        retry_delay = 1
        while retries <= max_retries:
            try:
                with SQLManager.create_connect() as conn:
                    conn.execute(text(sql), data)
                return
            except Exception as e:
                if "database is locked" in str(e) or "locking protocol" in str(e):
                    retries += 1
                    time.sleep(retry_delay)
                else:
                    logger.error("database execute failed: {}", str(e))
                    raise RuntimeError(82000, str(e)) from None
        raise Exception("Max retries exceeded")

    def update_result(self, dataset_id, instance_id, status):
        dataset_data = {
            "dataset_id": dataset_id
        }
        query_dataset_sql = str(self.sql_dict.get("query_dataset_sql"))
        with SQLManager.create_connect() as conn:
            result = conn.execute(text(query_dataset_sql), dataset_data)
            if result:
                rows = result.fetchall()
                total_size = sum(int(row[0]) for row in rows)
                file_count = len(rows)
            else:
                total_size = 0
                file_count = 0

        dataset_data.update({
            "task_id": instance_id,
            "total_size": total_size,
            "file_count": file_count
        })

        update_dataset_sql = str(self.sql_dict.get("update_dataset_sql"))
        self.insert_result(dataset_data, update_dataset_sql)

        task_data = {
            "task_id": instance_id,
            "status": status,
            "total_size": total_size,
            "finished_time": datetime.now()
        }
        update_task_sql = str(self.sql_dict.get("update_task_sql"))
        self.insert_result(task_data, update_task_sql)

    def query_task_info(self, instance_ids: list[str]):
        result = {}
        current_result = None
        for instance_id in instance_ids:
            try:
                current_result = self.execute_sql_query(instance_id)
            except Exception as e:
                logger.warning("instance_id: {}, query job result error: {}", instance_id, str(e))
            if current_result:
                result[instance_id] = current_result
        return result

    def execute_sql_query(self, instance_id):
        result = None
        create_tables_sql = str(self.sql_dict.get("create_tables_sql"))
        query_sql = str(self.sql_dict.get("query_sql"))
        with SQLManager.create_connect() as conn:
            conn.execute(text(create_tables_sql))
            execute_result = conn.execute(text(query_sql), {"instance_id": instance_id})
            result = execute_result.fetchall()
        return result

    # TODO: the delete API is still to be implemented
    def delete_task_info(self, instance_id: str):
        create_tables_sql = self.sql_dict.get("create_tables_sql")
        delete_task_instance_sql = self.sql_dict.get("delete_task_instance_sql")
        try:
            with SQLManager.create_connect() as conn:
                conn.execute(text(create_tables_sql))
                conn.execute(text(delete_task_instance_sql), {"instance_id": instance_id})
        except Exception as e:
            logger.warning("delete database for flow: {} failed: {}", instance_id, str(e))

    def delete_task_operate_info(self, instance_id: str):
        create_duplicate_img_tables_sql = self.sql_dict.get("create_duplicate_img_tables_sql")
        create_similar_img_tables_sql = self.sql_dict.get("create_similar_img_tables_sql")
        create_similar_text_tables_sql = self.sql_dict.get("create_similar_text_tables_sql")
        delete_duplicate_img_tables_sql = self.sql_dict.get("delete_duplicate_img_tables_sql")
        delete_similar_img_tables_sql = self.sql_dict.get("delete_similar_img_tables_sql")
        delete_similar_text_tables_sql = self.sql_dict.get("delete_similar_text_tables_sql")
        try:
            with SQLManager.create_connect() as conn:
                conn.execute(text(create_duplicate_img_tables_sql))
                conn.execute(text(delete_duplicate_img_tables_sql), {"instance_id": instance_id})
                conn.execute(text(create_similar_img_tables_sql))
                conn.execute(text(delete_similar_img_tables_sql), {"instance_id": instance_id})
                conn.execute(text(create_similar_text_tables_sql))
                conn.execute(text(delete_similar_text_tables_sql), {"instance_id": instance_id})
        except Exception as e:
            logger.warning("delete database for flow: {} error: {}", instance_id, str(e))
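For orientation, a minimal usage sketch of the class above (not part of the commit): the keys of sample mirror the ones persistence_task_info reads, but every id, value and path below is illustrative only, and it assumes a reachable MySQL instance with the datamate schema.

# Hypothetical usage sketch; assumes TaskInfoPersistence from the file above is importable.
persistence = TaskInfoPersistence()
sample = {
    "instance_id": "inst-001",            # illustrative ids and values only
    "dataset_id": "ds-001",
    "sourceFileName": "raw.txt", "sourceFileType": "txt",
    "sourceFileId": "src-001", "sourceFileSize": 2048,
    "fileName": "clean.txt", "fileType": "txt", "fileSize": 1024,
    "filePath": "/tmp/clean.txt",         # must exist; os.path.getmtime() is called on it
    "execute_status": "SUCCESS", "failed_reason": "",
}
persistence.persistence_task_info(sample)                    # rows into t_clean_result and t_dm_dataset_files
persistence.update_result("ds-001", "inst-001", "SUCCESS")   # roll up dataset size/count and close the task
print(persistence.query_task_info(["inst-001"]))             # any t_task_instance_info rows, keyed by instance id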
@@ -0,0 +1,17 @@
{
  "query_sql": "SELECT * FROM t_task_instance_info WHERE instance_id IN (:instance_id)",
  "insert_sql": "INSERT INTO t_task_instance_info (instance_id, meta_file_name, meta_file_type, meta_file_id, meta_file_size, file_id, file_size, file_type, file_name, file_path, status, operator_id, error_code, incremental, child_id, slice_num) VALUES (:instance_id, :meta_file_name, :meta_file_type, :meta_file_id, :meta_file_size, :file_id, :file_size, :file_type, :file_name, :file_path, :status, :operator_id, :error_code, :incremental, :child_id, :slice_num)",
  "insert_dataset_file_sql": "INSERT INTO t_dm_dataset_files (id, dataset_id, file_name, file_path, file_type, file_size, status, upload_time, last_access_time, created_at, updated_at) VALUES (:id, :dataset_id, :file_name, :file_path, :file_type, :file_size, :status, :upload_time, :last_access_time, :created_at, :updated_at)",
  "insert_clean_result_sql": "INSERT INTO t_clean_result (instance_id, src_file_id, dest_file_id, src_name, dest_name, src_type, dest_type, src_size, dest_size, status, result) VALUES (:instance_id, :src_file_id, :dest_file_id, :src_name, :dest_name, :src_type, :dest_type, :src_size, :dest_size, :status, :result)",
  "query_dataset_sql": "SELECT file_size FROM t_dm_dataset_files WHERE dataset_id = :dataset_id",
  "update_dataset_sql": "UPDATE t_dm_datasets SET size_bytes = :total_size, file_count = :file_count WHERE id = :dataset_id;",
  "update_task_sql": "UPDATE t_clean_task SET status = :status, after_size = :total_size, finished_at = :finished_time WHERE id = :task_id",
  "create_tables_sql": "CREATE TABLE IF NOT EXISTS t_task_instance_info (instance_id VARCHAR(255), meta_file_name TEXT, meta_file_type VARCHAR(100), meta_file_id BIGINT, meta_file_size VARCHAR(100), file_id BIGINT, file_size VARCHAR(100), file_type VARCHAR(100), file_name TEXT, file_path TEXT, status INT, operator_id VARCHAR(255), error_code VARCHAR(100), incremental VARCHAR(50), child_id BIGINT, slice_num INT DEFAULT 0);",
  "delete_task_instance_sql": "DELETE FROM t_task_instance_info WHERE instance_id = :instance_id",
  "create_duplicate_img_tables_sql": "CREATE TABLE IF NOT EXISTS operator_duplicate_img_features (id INT AUTO_INCREMENT PRIMARY KEY, task_uuid VARCHAR(255), file_feature TEXT, file_name TEXT, timestamp DATETIME);",
  "delete_duplicate_img_tables_sql": "DELETE FROM operator_duplicate_img_features WHERE task_uuid = :instance_id",
  "create_similar_img_tables_sql": "CREATE TABLE IF NOT EXISTS operator_similar_img_features (id INT AUTO_INCREMENT PRIMARY KEY, task_uuid VARCHAR(255), p_hash TEXT, des_matrix BLOB, matrix_shape TEXT, file_name TEXT, timestamp DATETIME);",
  "delete_similar_img_tables_sql": "DELETE FROM operator_similar_img_features WHERE task_uuid = :instance_id",
  "create_similar_text_tables_sql": "CREATE TABLE IF NOT EXISTS operators_similar_text_features (id INT AUTO_INCREMENT PRIMARY KEY, task_uuid VARCHAR(255), file_feature TEXT, file_name TEXT, timestamp DATETIME);",
  "delete_similar_text_tables_sql": "DELETE FROM operators_similar_text_features WHERE task_uuid = :instance_id"
}
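A note on query_sql above: it binds a single :instance_id inside IN (...), which matches how execute_sql_query calls it. If several instance ids ever need to be queried in one round trip, SQLAlchemy's expanding bind parameter is the usual mechanism; a small sketch under that assumption (not part of the commit):

# Illustrative only: expand a Python list into the IN (...) clause.
from sqlalchemy import bindparam, text

stmt = text("SELECT * FROM t_task_instance_info WHERE instance_id IN :instance_id")
stmt = stmt.bindparams(bindparam("instance_id", expanding=True))
with SQLManager.create_connect() as conn:
    rows = conn.execute(stmt, {"instance_id": ["inst-001", "inst-002"]}).fetchall()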
runtime/python-executor/datamate/sql_manager/sql_manager.py (new normal file, 52 lines added)
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
import os
import time
from random import uniform

from loguru import logger
from sqlalchemy import create_engine, inspect
from sqlalchemy.engine import URL


class SQLManager:

    @staticmethod
    def create_connect(max_retries=5, base_delay=1):
        """
        Connect to the MySQL database via SQLAlchemy and PyMySQL.

        :param max_retries: maximum number of connection attempts
        :param base_delay: base backoff delay in seconds
        :return: a SQLAlchemy connection object
        """

        connection_url = URL.create(
            drivername="mysql+pymysql",
            username=os.getenv("MYSQL_USER", "root"),
            password=os.getenv("MYSQL_PASSWORD", "Huawei@123"),
            host=os.getenv("MYSQL_HOST", "mysql"),
            port=int(os.getenv("MYSQL_PORT", "3306")),
            database=os.getenv("MYSQL_DATABASE", "datamate"),
            query={"charset": "utf8mb4"},
        )

        attempt = 0

        while True:
            try:
                engine = create_engine(connection_url, pool_pre_ping=True, isolation_level="AUTOCOMMIT")
                return engine.connect()
            except Exception as e:
                logger.error(f"Attempt {attempt + 1} failed with error: {str(e)}")
                if attempt >= max_retries - 1:
                    raise
                wait_time = min(30, base_delay * (2 ** attempt))  # exponential backoff, capped at 30 seconds
                jitter = uniform(-wait_time / 4, wait_time / 4)  # random jitter of up to a quarter of the wait
                time.sleep(wait_time + jitter)
                attempt += 1


if __name__ == "__main__":
    with SQLManager.create_connect() as connection:
        inspector = inspect(connection)
        print(inspector.get_table_names())
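As a quick check on the retry behaviour above: with the defaults max_retries=5 and base_delay=1, create_connect sleeps after each of the first four failed attempts for 1, 2, 4 and 8 seconds (the 30-second cap is never reached at these defaults), each wait shifted by up to a quarter of its value in jitter. A tiny illustrative snippet, not part of the commit, that just prints that schedule:

# Illustrative only: reproduce the wait schedule used by SQLManager.create_connect.
base_delay, max_retries = 1, 5
for attempt in range(max_retries - 1):  # the final attempt re-raises instead of sleeping
    wait_time = min(30, base_delay * (2 ** attempt))
    print(f"after attempt {attempt + 1}: wait {wait_time}s, jitter within ±{wait_time / 4:.2f}s")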