init datamate

This commit is contained in:
Dallas98
2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions

View File

@@ -0,0 +1,176 @@
# -*- coding: utf-8 -*-
import json
import os
import time
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any, Dict

from loguru import logger
from sqlalchemy import text

from datamate.sql_manager.sql_manager import SQLManager


class TaskInfoPersistence:
    def __init__(self):
        self.sql_dict = self.load_sql_dict()

    @staticmethod
    def load_sql_dict():
        """Load the SQL statements from the JSON config file."""
        sql_config_path = str(Path(__file__).parent / 'sql' / 'sql_config.json')
        with open(sql_config_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    def persistence_task_info(self, sample: Dict[str, Any]):
        instance_id = str(sample.get("instance_id"))
        src_file_name = str(sample.get("sourceFileName"))
        src_file_type = str(sample.get("sourceFileType"))
        src_file_id = str(sample.get("sourceFileId"))
        src_file_size = int(sample.get("sourceFileSize"))
        file_id = str(uuid.uuid4())
        file_size = str(sample.get("fileSize"))
        file_type = str(sample.get("fileType"))
        file_name = str(sample.get("fileName"))
        status = str(sample.get("execute_status"))
        failed_reason = str(sample.get("failed_reason"))
        # Record the source -> destination mapping for this clean run.
        result_data = {
            "instance_id": instance_id,
            "src_file_id": src_file_id,
            "dest_file_id": file_id,
            "src_name": src_file_name,
            "dest_name": file_name,
            "src_type": src_file_type,
            "dest_type": file_type,
            "src_size": src_file_size,
            "dest_size": file_size,
            "status": status,
            "result": failed_reason
        }
        self.insert_result(result_data, str(self.sql_dict.get("insert_clean_result_sql")))

        # Register the produced file under its dataset.
        dataset_id = str(sample.get("dataset_id"))
        file_path = str(sample.get("filePath"))
        create_time = datetime.now()
        last_access_time = datetime.fromtimestamp(os.path.getmtime(file_path))
        file_data = {
            "id": file_id,
            "dataset_id": dataset_id,
            "file_name": file_name,
            "file_path": file_path,
            "file_type": file_type,
            "file_size": file_size,
            "status": "COMPLETED",
            "upload_time": create_time,
            "last_access_time": last_access_time,
            "created_at": create_time,
            "updated_at": create_time
        }
        self.insert_result(file_data, str(self.sql_dict.get("insert_dataset_file_sql")))
    @staticmethod
    def insert_result(data, sql):
        retries = 0
        max_retries = 20
        retry_delay = 1
        while retries <= max_retries:
            try:
                with SQLManager.create_connect() as conn:
                    conn.execute(text(sql), data)
                return
            except Exception as e:
                # Retry only on lock contention; any other error is fatal.
                if "database is locked" in str(e) or "locking protocol" in str(e):
                    retries += 1
                    time.sleep(retry_delay)
                else:
                    logger.error("database execute failed: {}", str(e))
                    raise RuntimeError(82000, str(e)) from None
        raise Exception("Max retries exceeded")
    def update_result(self, dataset_id, instance_id, status):
        dataset_data = {
            "dataset_id": dataset_id
        }
        query_dataset_sql = str(self.sql_dict.get("query_dataset_sql"))
        with SQLManager.create_connect() as conn:
            result = conn.execute(text(query_dataset_sql), dataset_data)
            rows = result.fetchall()
        total_size = sum(int(row[0]) for row in rows)
        file_count = len(rows)
        dataset_data.update({
            "task_id": instance_id,
            "total_size": total_size,
            "file_count": file_count
        })
        update_dataset_sql = str(self.sql_dict.get("update_dataset_sql"))
        self.insert_result(dataset_data, update_dataset_sql)

        task_data = {
            "task_id": instance_id,
            "status": status,
            "total_size": total_size,
            "finished_time": datetime.now()
        }
        update_task_sql = str(self.sql_dict.get("update_task_sql"))
        self.insert_result(task_data, update_task_sql)
    def query_task_info(self, instance_ids: list[str]):
        result = {}
        for instance_id in instance_ids:
            current_result = None  # reset per id so a failed query cannot reuse the previous id's result
            try:
                current_result = self.execute_sql_query(instance_id)
            except Exception as e:
                logger.warning("instance_id: {}, query job result error: {}", instance_id, str(e))
            if current_result:
                result[instance_id] = current_result
        return result

    def execute_sql_query(self, instance_id):
        create_tables_sql = str(self.sql_dict.get("create_tables_sql"))
        query_sql = str(self.sql_dict.get("query_sql"))
        with SQLManager.create_connect() as conn:
            conn.execute(text(create_tables_sql))
            execute_result = conn.execute(text(query_sql), {"instance_id": instance_id})
            result = execute_result.fetchall()
        return result
    # TODO: the delete API is still to be implemented
    def delete_task_info(self, instance_id: str):
        create_tables_sql = self.sql_dict.get("create_tables_sql")
        delete_task_instance_sql = self.sql_dict.get("delete_task_instance_sql")
        try:
            with SQLManager.create_connect() as conn:
                conn.execute(text(create_tables_sql))
                conn.execute(text(delete_task_instance_sql), {"instance_id": instance_id})
        except Exception as e:
            logger.warning("delete database for flow: {} failed: {}", instance_id, str(e))

    def delete_task_operate_info(self, instance_id: str):
        create_duplicate_img_tables_sql = self.sql_dict.get("create_duplicate_img_tables_sql")
        create_similar_img_tables_sql = self.sql_dict.get("create_similar_img_tables_sql")
        create_similar_text_tables_sql = self.sql_dict.get("create_similar_text_tables_sql")
        delete_duplicate_img_tables_sql = self.sql_dict.get("delete_duplicate_img_tables_sql")
        delete_similar_img_tables_sql = self.sql_dict.get("delete_similar_img_tables_sql")
        delete_similar_text_tables_sql = self.sql_dict.get("delete_similar_text_tables_sql")
        try:
            with SQLManager.create_connect() as conn:
                # Ensure each operator feature table exists before deleting this instance's rows.
                conn.execute(text(create_duplicate_img_tables_sql))
                conn.execute(text(delete_duplicate_img_tables_sql), {"instance_id": instance_id})
                conn.execute(text(create_similar_img_tables_sql))
                conn.execute(text(delete_similar_img_tables_sql), {"instance_id": instance_id})
                conn.execute(text(create_similar_text_tables_sql))
                conn.execute(text(delete_similar_text_tables_sql), {"instance_id": instance_id})
        except Exception as e:
            logger.warning("delete database for flow: {} error: {}", instance_id, str(e))

View File

@@ -0,0 +1,17 @@
{
  "query_sql": "SELECT * FROM t_task_instance_info WHERE instance_id IN (:instance_id)",
  "insert_sql": "INSERT INTO t_task_instance_info (instance_id, meta_file_name, meta_file_type, meta_file_id, meta_file_size, file_id, file_size, file_type, file_name, file_path, status, operator_id, error_code, incremental, child_id, slice_num) VALUES (:instance_id, :meta_file_name, :meta_file_type, :meta_file_id, :meta_file_size, :file_id, :file_size, :file_type, :file_name, :file_path, :status, :operator_id, :error_code, :incremental, :child_id, :slice_num)",
  "insert_dataset_file_sql": "INSERT INTO t_dm_dataset_files (id, dataset_id, file_name, file_path, file_type, file_size, status, upload_time, last_access_time, created_at, updated_at) VALUES (:id, :dataset_id, :file_name, :file_path, :file_type, :file_size, :status, :upload_time, :last_access_time, :created_at, :updated_at)",
  "insert_clean_result_sql": "INSERT INTO t_clean_result (instance_id, src_file_id, dest_file_id, src_name, dest_name, src_type, dest_type, src_size, dest_size, status, result) VALUES (:instance_id, :src_file_id, :dest_file_id, :src_name, :dest_name, :src_type, :dest_type, :src_size, :dest_size, :status, :result)",
  "query_dataset_sql": "SELECT file_size FROM t_dm_dataset_files WHERE dataset_id = :dataset_id",
  "update_dataset_sql": "UPDATE t_dm_datasets SET size_bytes = :total_size, file_count = :file_count WHERE id = :dataset_id",
  "update_task_sql": "UPDATE t_clean_task SET status = :status, after_size = :total_size, finished_at = :finished_time WHERE id = :task_id",
  "create_tables_sql": "CREATE TABLE IF NOT EXISTS t_task_instance_info (instance_id VARCHAR(255), meta_file_name TEXT, meta_file_type VARCHAR(100), meta_file_id BIGINT, meta_file_size VARCHAR(100), file_id BIGINT, file_size VARCHAR(100), file_type VARCHAR(100), file_name TEXT, file_path TEXT, status INT, operator_id VARCHAR(255), error_code VARCHAR(100), incremental VARCHAR(50), child_id BIGINT, slice_num INT DEFAULT 0);",
  "delete_task_instance_sql": "DELETE FROM t_task_instance_info WHERE instance_id = :instance_id",
  "create_duplicate_img_tables_sql": "CREATE TABLE IF NOT EXISTS operator_duplicate_img_features (id INT AUTO_INCREMENT PRIMARY KEY, task_uuid VARCHAR(255), file_feature TEXT, file_name TEXT, timestamp DATETIME);",
  "delete_duplicate_img_tables_sql": "DELETE FROM operator_duplicate_img_features WHERE task_uuid = :instance_id",
  "create_similar_img_tables_sql": "CREATE TABLE IF NOT EXISTS operator_similar_img_features (id INT AUTO_INCREMENT PRIMARY KEY, task_uuid VARCHAR(255), p_hash TEXT, des_matrix BLOB, matrix_shape TEXT, file_name TEXT, timestamp DATETIME);",
  "delete_similar_img_tables_sql": "DELETE FROM operator_similar_img_features WHERE task_uuid = :instance_id",
  "create_similar_text_tables_sql": "CREATE TABLE IF NOT EXISTS operators_similar_text_features (id INT AUTO_INCREMENT PRIMARY KEY, task_uuid VARCHAR(255), file_feature TEXT, file_name TEXT, timestamp DATETIME);",
  "delete_similar_text_tables_sql": "DELETE FROM operators_similar_text_features WHERE task_uuid = :instance_id"
}
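
These statements use SQLAlchemy's named bind-parameter syntax (:name), so the keys of the dict passed to conn.execute must match the placeholders exactly. A minimal sketch of that contract, using a made-up instance id:

# Sketch only: executing one entry from this config.
from sqlalchemy import text
from datamate.sql_manager.sql_manager import SQLManager

delete_sql = "DELETE FROM t_task_instance_info WHERE instance_id = :instance_id"
with SQLManager.create_connect() as conn:
    # The dict key "instance_id" must match the :instance_id placeholder,
    # otherwise SQLAlchemy raises a bind-parameter error.
    conn.execute(text(delete_sql), {"instance_id": "inst-001"})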

View File

@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
import os
import time
from random import uniform

from loguru import logger
from sqlalchemy import create_engine, inspect
from sqlalchemy.engine import URL


class SQLManager:
    @staticmethod
    def create_connect(max_retries=5, base_delay=1):
        """
        Connect to the MySQL database via SQLAlchemy and PyMySQL.

        :param max_retries: maximum number of connection attempts
        :param base_delay: base delay (seconds) for exponential backoff
        :return: a SQLAlchemy connection object
        """
        connection_url = URL.create(
            drivername="mysql+pymysql",
            username=os.getenv("MYSQL_USER", "root"),
            password=os.getenv("MYSQL_PASSWORD", "Huawei@123"),
            host=os.getenv("MYSQL_HOST", "mysql"),
            port=int(os.getenv("MYSQL_PORT", 3306)),
            database=os.getenv("MYSQL_DATABASE", "datamate"),
            query={"charset": "utf8mb4"},
        )
        attempt = 0
        while True:
            try:
                engine = create_engine(connection_url, pool_pre_ping=True, isolation_level="AUTOCOMMIT")
                return engine.connect()
            except Exception as e:
                logger.error(f"Attempt {attempt + 1} failed with error: {str(e)}")
                if attempt >= max_retries - 1:
                    raise
                wait_time = min(30, base_delay * (2 ** attempt))  # cap the backoff at 30 seconds
                jitter = uniform(-wait_time / 4, wait_time / 4)  # random jitter to avoid synchronized retries
                time.sleep(wait_time + jitter)
                attempt += 1

if __name__ == "__main__":
    with SQLManager.create_connect() as connection:
        inspector = inspect(connection)
        print(inspector.get_table_names())
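
Connection settings all come from environment variables with the defaults above; a minimal sketch of pointing the manager at another MySQL instance (host and port here are placeholders):

# Sketch only: overriding the connection target via environment variables.
import os

os.environ["MYSQL_HOST"] = "127.0.0.1"   # placeholder
os.environ["MYSQL_PORT"] = "3307"        # read from the env as a string, cast to int in create_connect
os.environ["MYSQL_DATABASE"] = "datamate"

with SQLManager.create_connect() as conn:
    print(conn.closed)  # False while the connection is open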