You've already forked DataMate
优化部分问题 (#126)
* feature: 支持相对路径引用 * feature: 优化本地部署命令 * feature: 优化算子编排展示 * feature: 优化清洗任务失败后重试
This commit is contained in:
@@ -70,15 +70,15 @@ make install-mineru
|
|||||||
```
|
```
|
||||||
|
|
||||||
### 部署DeerFlow服务
|
### 部署DeerFlow服务
|
||||||
1. 修改runtime/deer-flow/.env.example,添加SEARCH_API_KEY和EMBEDDING模型配置
|
```bash
|
||||||
2. 修改runtime/deer-flow/.conf.yaml.example,添加基础模型服务配置
|
make install-deer-flow
|
||||||
3. 执行`make install-deer-flow`
|
```
|
||||||
|
|
||||||
### 本地开发部署
|
### 本地开发部署
|
||||||
本地代码修改后,请执行以下命令构建镜像并使用本地镜像部署
|
本地代码修改后,请执行以下命令构建镜像并使用本地镜像部署
|
||||||
```bash
|
```bash
|
||||||
make build
|
make build
|
||||||
make install REGISTRY=""
|
make install dev=true
|
||||||
```
|
```
|
||||||
|
|
||||||
## 🤝 贡献指南
|
## 🤝 贡献指南
|
||||||
|
|||||||
@@ -171,10 +171,10 @@ public class CleaningTaskService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void executeTask(String taskId) {
|
public void executeTask(String taskId) {
|
||||||
List<CleaningResultDto> failed = cleaningResultRepo.findByInstanceId(taskId, "FAILED");
|
List<CleaningResultDto> succeed = cleaningResultRepo.findByInstanceId(taskId, "COMPLETED");
|
||||||
Set<String> failedSet = failed.stream().map(CleaningResultDto::getSrcFileId).collect(Collectors.toSet());
|
Set<String> succeedSet = succeed.stream().map(CleaningResultDto::getSrcFileId).collect(Collectors.toSet());
|
||||||
CleaningTaskDto task = cleaningTaskRepo.findTaskById(taskId);
|
CleaningTaskDto task = cleaningTaskRepo.findTaskById(taskId);
|
||||||
scanDataset(taskId, task.getSrcDatasetId(), failedSet);
|
scanDataset(taskId, task.getSrcDatasetId(), succeedSet);
|
||||||
cleaningResultRepo.deleteByInstanceId(taskId, "FAILED");
|
cleaningResultRepo.deleteByInstanceId(taskId, "FAILED");
|
||||||
taskScheduler.executeTask(taskId);
|
taskScheduler.executeTask(taskId);
|
||||||
}
|
}
|
||||||
@@ -232,7 +232,7 @@ public class CleaningTaskService {
|
|||||||
} while (pageNumber < datasetFiles.getTotalPages());
|
} while (pageNumber < datasetFiles.getTotalPages());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void scanDataset(String taskId, String srcDatasetId, Set<String> failedFiles) {
|
private void scanDataset(String taskId, String srcDatasetId, Set<String> succeedFiles) {
|
||||||
int pageNumber = 0;
|
int pageNumber = 0;
|
||||||
int pageSize = 500;
|
int pageSize = 500;
|
||||||
PagingQuery pageRequest = new PagingQuery(pageNumber, pageSize);
|
PagingQuery pageRequest = new PagingQuery(pageNumber, pageSize);
|
||||||
@@ -243,7 +243,7 @@ public class CleaningTaskService {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
List<Map<String, Object>> files = datasetFiles.getContent().stream()
|
List<Map<String, Object>> files = datasetFiles.getContent().stream()
|
||||||
.filter(content -> failedFiles.contains(content.getId()))
|
.filter(content -> !succeedFiles.contains(content.getId()))
|
||||||
.map(content -> Map.of("fileName", (Object) content.getFileName(),
|
.map(content -> Map.of("fileName", (Object) content.getFileName(),
|
||||||
"fileSize", content.getFileSize(),
|
"fileSize", content.getFileSize(),
|
||||||
"filePath", content.getFilePath(),
|
"filePath", content.getFilePath(),
|
||||||
|
|||||||
@@ -182,14 +182,6 @@ const OperatorFlow: React.FC<OperatorFlowProps> = ({
|
|||||||
{operator?.categories?.map((categoryId) => {
|
{operator?.categories?.map((categoryId) => {
|
||||||
return <Tag color="default">{categoryMap[categoryId].name}</Tag>
|
return <Tag color="default">{categoryMap[categoryId].name}</Tag>
|
||||||
})}
|
})}
|
||||||
{/* 参数状态指示 */}
|
|
||||||
{Object.values(operator.configs).some(
|
|
||||||
(param: any) =>
|
|
||||||
(param.type === "input" && !param.value) ||
|
|
||||||
(param.type === "checkbox" &&
|
|
||||||
Array.isArray(param.value) &&
|
|
||||||
param.value.length === 0)
|
|
||||||
) && <Tag color="red">待配置</Tag>}
|
|
||||||
{/* 操作按钮 */}
|
{/* 操作按钮 */}
|
||||||
<span
|
<span
|
||||||
className="cursor-pointer text-red-500"
|
className="cursor-pointer text-red-500"
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ from datamate.core.base_op import Filter, Mapper, Slicer
|
|||||||
from datamate.core.constant import Fields
|
from datamate.core.constant import Fields
|
||||||
from datamate.core.base_op import OPERATORS, BaseOp
|
from datamate.core.base_op import OPERATORS, BaseOp
|
||||||
|
|
||||||
|
from core.base_op import Filter as RELATIVE_Filter, Mapper as RELATIVE_Mapper, Slicer as RELATIVE_Slicer
|
||||||
|
|
||||||
rd.DataContext.get_current().enable_progress_bars = False
|
rd.DataContext.get_current().enable_progress_bars = False
|
||||||
|
|
||||||
|
|
||||||
@@ -136,7 +138,10 @@ class RayDataset(BasicDataset):
|
|||||||
parent_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "ops")
|
parent_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "ops")
|
||||||
if parent_dir not in sys.path:
|
if parent_dir not in sys.path:
|
||||||
sys.path.insert(0, parent_dir)
|
sys.path.insert(0, parent_dir)
|
||||||
registry_content = OPERATORS.modules[op_name]
|
registry_content = OPERATORS.modules.get(op_name)
|
||||||
|
if registry_content is None:
|
||||||
|
from core.base_op import OPERATORS as RELATIVE_OPERATORS
|
||||||
|
registry_content = RELATIVE_OPERATORS.modules.get(op_name)
|
||||||
if isinstance(registry_content, str):
|
if isinstance(registry_content, str):
|
||||||
# registry_content是module的路径
|
# registry_content是module的路径
|
||||||
submodule = importlib.import_module(registry_content)
|
submodule = importlib.import_module(registry_content)
|
||||||
@@ -171,7 +176,7 @@ class RayDataset(BasicDataset):
|
|||||||
|
|
||||||
kwargs.update({"ext_params": {}, "failed_reason": {}, "target_type": None})
|
kwargs.update({"ext_params": {}, "failed_reason": {}, "target_type": None})
|
||||||
try:
|
try:
|
||||||
if issubclass(operators_cls, Mapper):
|
if issubclass(operators_cls, (Mapper, RELATIVE_Mapper)):
|
||||||
self.data = self.data.map(operators_cls,
|
self.data = self.data.map(operators_cls,
|
||||||
fn_constructor_kwargs=init_kwargs,
|
fn_constructor_kwargs=init_kwargs,
|
||||||
fn_kwargs=kwargs,
|
fn_kwargs=kwargs,
|
||||||
@@ -179,7 +184,7 @@ class RayDataset(BasicDataset):
|
|||||||
num_cpus=0.05,
|
num_cpus=0.05,
|
||||||
concurrency=(1, 1 if operators_cls.use_model else int(max_actor_nums)))
|
concurrency=(1, 1 if operators_cls.use_model else int(max_actor_nums)))
|
||||||
|
|
||||||
elif issubclass(operators_cls, Slicer):
|
elif issubclass(operators_cls, (Slicer, RELATIVE_Slicer)):
|
||||||
self.data = self.data.flat_map(operators_cls,
|
self.data = self.data.flat_map(operators_cls,
|
||||||
fn_constructor_kwargs=init_kwargs,
|
fn_constructor_kwargs=init_kwargs,
|
||||||
fn_kwargs=kwargs,
|
fn_kwargs=kwargs,
|
||||||
@@ -187,7 +192,7 @@ class RayDataset(BasicDataset):
|
|||||||
num_cpus=0.05,
|
num_cpus=0.05,
|
||||||
concurrency=(1, int(max_actor_nums)))
|
concurrency=(1, int(max_actor_nums)))
|
||||||
|
|
||||||
elif issubclass(operators_cls, Filter):
|
elif issubclass(operators_cls, (Filter, RELATIVE_Filter)):
|
||||||
self.data = self.data.filter(operators_cls,
|
self.data = self.data.filter(operators_cls,
|
||||||
fn_constructor_kwargs=init_kwargs,
|
fn_constructor_kwargs=init_kwargs,
|
||||||
fn_kwargs=kwargs,
|
fn_kwargs=kwargs,
|
||||||
|
|||||||
@@ -26,4 +26,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
EXPOSE 8000
|
EXPOSE 8000
|
||||||
|
|
||||||
# Run the application.
|
# Run the application.
|
||||||
CMD ["uv", "run", "python", "server.py", "--host", "0.0.0.0", "--port", "8000"]
|
CMD ["uv", "run", "--no-sync", "python", "server.py", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
|
|||||||
Reference in New Issue
Block a user