init datamate
80
runtime/datax/nfsreader/pom.xml
Normal file
@@ -0,0 +1,80 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>datax-all</artifactId>
        <groupId>com.alibaba.datax</groupId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>

    <modelVersion>4.0.0</modelVersion>
    <artifactId>nfsreader</artifactId>
    <name>nfsreader</name>
    <packaging>jar</packaging>

    <dependencies>
        <dependency>
            <groupId>com.alibaba.datax</groupId>
            <artifactId>datax-core</artifactId>
            <version>${datax-project-version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba.datax</groupId>
            <artifactId>datax-common</artifactId>
            <version>${datax-project-version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
        </dependency>

        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
        </dependency>
    </dependencies>

    <build>
        <resources>
            <resource>
                <directory>src/main/java</directory>
                <includes>
                    <include>**/*.properties</include>
                </includes>
            </resource>
        </resources>
        <plugins>
            <!-- compiler plugin -->
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>${jdk-version}</source>
                    <target>${jdk-version}</target>
                    <encoding>${project-sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <!-- assembly plugin -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptors>
                        <descriptor>src/main/assembly/package.xml</descriptor>
                    </descriptors>
                    <finalName>datax</finalName>
                </configuration>
                <executions>
                    <execution>
                        <id>dwzip</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
35
runtime/datax/nfsreader/src/main/assembly/package.xml
Normal file
@@ -0,0 +1,35 @@
<assembly
        xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
    <id></id>
    <formats>
        <format>dir</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>src/main/resources</directory>
            <includes>
                <include>plugin.json</include>
                <include>plugin_job_template.json</include>
            </includes>
            <outputDirectory>plugin/reader/nfsreader</outputDirectory>
        </fileSet>
        <fileSet>
            <directory>target/</directory>
            <includes>
                <include>nfsreader-0.0.1-SNAPSHOT.jar</include>
            </includes>
            <outputDirectory>plugin/reader/nfsreader</outputDirectory>
        </fileSet>
    </fileSets>

    <dependencySets>
        <dependencySet>
            <useProjectArtifact>false</useProjectArtifact>
            <outputDirectory>plugin/reader/nfsreader/libs</outputDirectory>
            <scope>runtime</scope>
        </dependencySet>
    </dependencySets>
</assembly>
@@ -0,0 +1,121 @@
package com.modelengine.edatamate.plugin.reader.nfsreader;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.DirectoryNotEmptyException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

/**
 * A simple NAS mounting utility for Linux.
 * Linux only; requires sudo privileges or root.
 */
public final class MountUtil {
    private static final Logger LOG = LoggerFactory.getLogger(MountUtil.class);

    private MountUtil() {
    }

    /**
     * Mounts a remote directory.
     *
     * @param remote     remote address, e.g. 192.168.1.1:/test
     * @param mountPoint local mount point, e.g. /mnt/nas
     * @param type       file system type: nfs, cifs, ...
     * @param options    extra mount options, e.g. ro,vers=3 or username=xxx,password=xxx
     */
    public static void mount(String remote, String mountPoint, String type, String options) {
        try {
            Path mp = Paths.get(mountPoint);
            if (isMounted(mountPoint)) {
                throw new IOException("Already mounted: " + mountPoint);
            }

            Files.createDirectories(mp);

            ProcessBuilder pb = new ProcessBuilder();
            if (options == null || options.isEmpty()) {
                pb.command("mount", "-t", type, remote, mountPoint);
            } else {
                pb.command("mount", "-t", type, "-o", options, remote, mountPoint);
            }
            LOG.info(pb.command().toString());
            pb.redirectErrorStream(true);
            Process p = pb.start();
            StringBuilder output = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    output.append(line).append(System.lineSeparator());
                }
            }
            int rc = p.waitFor();
            if (rc != 0) {
                throw new RuntimeException("Mount failed, exit=" + rc + ", output: " + output);
            }
        } catch (IOException | InterruptedException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Unmounts a mount point.
     *
     * @param mountPoint mount point path
     * @throws IOException          if unmounting fails
     * @throws InterruptedException if waiting for the process is interrupted
     */
    public static void umount(String mountPoint) throws IOException, InterruptedException {
        if (!isMounted(mountPoint)) {
            return;
        }

        ProcessBuilder pb = new ProcessBuilder("umount", "-l", mountPoint);
        pb.redirectErrorStream(true);
        Process p = pb.start();
        StringBuilder output = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                output.append(line).append(System.lineSeparator());
            }
        }
        int rc = p.waitFor();
        if (rc != 0) {
            throw new RuntimeException("Umount failed, exit=" + rc + ", output: " + output);
        }

        // Clean up the now-empty directory
        try {
            Files.deleteIfExists(Paths.get(mountPoint));
        } catch (DirectoryNotEmptyException ignore) {
            // Directory is not empty; keep it
        }
    }

    /**
     * Checks whether a mount point is already mounted.
     *
     * @param mountPoint mount point path
     * @return true if the path is mounted
     * @throws IOException if /proc/mounts cannot be read
     */
    public static boolean isMounted(String mountPoint) throws IOException {
        Path procMounts = Paths.get("/proc/mounts");
        if (!Files.exists(procMounts)) {
            throw new IOException("/proc/mounts not found");
        }
        String expected = mountPoint.trim();
        List<String> lines = Files.readAllLines(procMounts);
        return lines.stream()
                .map(l -> l.split("\\s+"))
                .filter(a -> a.length >= 2)
                .anyMatch(a -> a[1].equals(expected));
    }
}
@@ -0,0 +1,112 @@
package com.modelengine.edatamate.plugin.reader.nfsreader;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.element.StringColumn;
import com.alibaba.datax.common.plugin.RecordSender;
import com.alibaba.datax.common.spi.Reader;
import com.alibaba.datax.common.util.Configuration;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class NfsReader extends Reader {

    private static final Logger LOG = LoggerFactory.getLogger(NfsReader.class);

    public static class Job extends Reader.Job {
        private Configuration jobConfig = null;
        private String mountPoint;

        @Override
        public void init() {
            this.jobConfig = super.getPluginJobConf();
        }

        @Override
        public void prepare() {
            this.mountPoint = "/dataset/mount/" + UUID.randomUUID();
            this.jobConfig.set("mountPoint", this.mountPoint);
            MountUtil.mount(this.jobConfig.getString("ip") + ":" + this.jobConfig.getString("path"),
                    mountPoint, "nfs", StringUtils.EMPTY);
        }

        @Override
        public List<Configuration> split(int adviceNumber) {
            return Collections.singletonList(this.jobConfig);
        }

        @Override
        public void post() {
            try {
                MountUtil.umount(this.mountPoint);
                new File(this.mountPoint).deleteOnExit();
            } catch (IOException | InterruptedException e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public void destroy() {
        }
    }

    public static class Task extends Reader.Task {

        private Configuration jobConfig;
        private String mountPoint;
        private Set<String> fileType;

        @Override
        public void init() {
            this.jobConfig = super.getPluginJobConf();
            this.mountPoint = this.jobConfig.getString("mountPoint");
            this.fileType = new HashSet<>(this.jobConfig.getList("fileType", Collections.emptyList(), String.class));
        }

        @Override
        public void startRead(RecordSender recordSender) {
            try (Stream<Path> stream = Files.list(Paths.get(this.mountPoint))) {
                List<String> files = stream.filter(Files::isRegularFile)
                        .filter(file -> fileType.isEmpty() || fileType.contains(getFileSuffix(file)))
                        .map(path -> path.getFileName().toString())
                        .collect(Collectors.toList());
                files.forEach(filePath -> {
                    Record record = recordSender.createRecord();
                    record.addColumn(new StringColumn(filePath));
                    recordSender.sendToWriter(record);
                });
                this.jobConfig.set("columnNumber", 1);
            } catch (IOException e) {
                LOG.error("Error reading files from mount point: {}", this.mountPoint, e);
                throw new RuntimeException(e);
            }
        }

        private String getFileSuffix(Path path) {
            String fileName = path.getFileName().toString();
            int lastDotIndex = fileName.lastIndexOf('.');
            if (lastDotIndex == -1 || lastDotIndex == fileName.length() - 1) {
                return "";
            }
            return fileName.substring(lastDotIndex + 1);
        }

        @Override
        public void destroy() {
        }
    }
}
6
runtime/datax/nfsreader/src/main/resources/plugin.json
Normal file
@@ -0,0 +1,6 @@
{
    "name": "nfsreader",
    "class": "com.modelengine.edatamate.plugin.reader.nfsreader.NfsReader",
    "description": "read from nas file system",
    "developer": "modelengine"
}
@@ -0,0 +1,7 @@
{
    "name": "nfsreader",
    "parameter": {
        "ip": "127.0.0.1",
        "path": "/test"
    }
}
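For orientation, the template above is the `reader` half of a DataX job description. A minimal sketch, assuming illustrative server and path values (not shipped defaults); the optional `fileType` suffix filter is the list read by `NfsReader.Task.init`:

```json
{
    "name": "nfsreader",
    "parameter": {
        "ip": "192.168.1.10",
        "path": "/export/dataset",
        "fileType": ["csv", "json"]
    }
}
```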
77
runtime/datax/nfswriter/pom.xml
Normal file
@@ -0,0 +1,77 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>datax-all</artifactId>
        <groupId>com.alibaba.datax</groupId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>

    <modelVersion>4.0.0</modelVersion>
    <artifactId>nfswriter</artifactId>
    <name>nfswriter</name>
    <packaging>jar</packaging>

    <dependencies>
        <dependency>
            <groupId>com.alibaba.datax</groupId>
            <artifactId>datax-core</artifactId>
            <version>${datax-project-version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba.datax</groupId>
            <artifactId>datax-common</artifactId>
            <version>${datax-project-version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
        </dependency>

        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
        </dependency>
    </dependencies>
    <build>
        <resources>
            <resource>
                <directory>src/main/java</directory>
                <includes>
                    <include>**/*.properties</include>
                </includes>
            </resource>
        </resources>
        <plugins>
            <!-- compiler plugin -->
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>${jdk-version}</source>
                    <target>${jdk-version}</target>
                    <encoding>${project-sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <!-- assembly plugin -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptors>
                        <descriptor>src/main/assembly/package.xml</descriptor>
                    </descriptors>
                    <finalName>datax</finalName>
                </configuration>
                <executions>
                    <execution>
                        <id>dwzip</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
35
runtime/datax/nfswriter/src/main/assembly/package.xml
Normal file
@@ -0,0 +1,35 @@
<assembly
        xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
    <id></id>
    <formats>
        <format>dir</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>src/main/resources</directory>
            <includes>
                <include>plugin.json</include>
                <include>plugin_job_template.json</include>
            </includes>
            <outputDirectory>plugin/writer/nfswriter</outputDirectory>
        </fileSet>
        <fileSet>
            <directory>target/</directory>
            <includes>
                <include>nfswriter-0.0.1-SNAPSHOT.jar</include>
            </includes>
            <outputDirectory>plugin/writer/nfswriter</outputDirectory>
        </fileSet>
    </fileSets>

    <dependencySets>
        <dependencySet>
            <useProjectArtifact>false</useProjectArtifact>
            <outputDirectory>plugin/writer/nfswriter/libs</outputDirectory>
            <scope>runtime</scope>
        </dependencySet>
    </dependencySets>
</assembly>
@@ -0,0 +1,121 @@
package com.modelengine.edatamate.plugin.writer.nfswriter;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.DirectoryNotEmptyException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

/**
 * A simple NAS mounting utility for Linux.
 * Linux only; requires sudo privileges or root.
 */
public final class MountUtil {
    private static final Logger LOG = LoggerFactory.getLogger(MountUtil.class);

    private MountUtil() {
    }

    /**
     * Mounts a remote directory.
     *
     * @param remote     remote address, e.g. 192.168.1.1:/test
     * @param mountPoint local mount point, e.g. /mnt/nas
     * @param type       file system type: nfs, cifs, ...
     * @param options    extra mount options, e.g. ro,vers=3 or username=xxx,password=xxx
     */
    public static void mount(String remote, String mountPoint, String type, String options) {
        try {
            Path mp = Paths.get(mountPoint);
            if (isMounted(mountPoint)) {
                throw new IOException("Already mounted: " + mountPoint);
            }

            Files.createDirectories(mp);

            ProcessBuilder pb = new ProcessBuilder();
            if (options == null || options.isEmpty()) {
                pb.command("mount", "-t", type, remote, mountPoint);
            } else {
                pb.command("mount", "-t", type, "-o", options, remote, mountPoint);
            }
            LOG.info(pb.command().toString());
            pb.redirectErrorStream(true);
            Process p = pb.start();
            StringBuilder output = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    output.append(line).append(System.lineSeparator());
                }
            }
            int rc = p.waitFor();
            if (rc != 0) {
                throw new RuntimeException("Mount failed, exit=" + rc + ", output: " + output);
            }
        } catch (IOException | InterruptedException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Unmounts a mount point.
     *
     * @param mountPoint mount point path
     * @throws IOException          if unmounting fails
     * @throws InterruptedException if waiting for the process is interrupted
     */
    public static void umount(String mountPoint) throws IOException, InterruptedException {
        if (!isMounted(mountPoint)) {
            return;
        }

        ProcessBuilder pb = new ProcessBuilder("umount", "-l", mountPoint);
        pb.redirectErrorStream(true);
        Process p = pb.start();
        StringBuilder output = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                output.append(line).append(System.lineSeparator());
            }
        }
        int rc = p.waitFor();
        if (rc != 0) {
            throw new RuntimeException("Umount failed, exit=" + rc + ", output: " + output);
        }

        // Clean up the now-empty directory
        try {
            Files.deleteIfExists(Paths.get(mountPoint));
        } catch (DirectoryNotEmptyException ignore) {
            // Directory is not empty; keep it
        }
    }

    /**
     * Checks whether a mount point is already mounted.
     *
     * @param mountPoint mount point path
     * @return true if the path is mounted
     * @throws IOException if /proc/mounts cannot be read
     */
    public static boolean isMounted(String mountPoint) throws IOException {
        Path procMounts = Paths.get("/proc/mounts");
        if (!Files.exists(procMounts)) {
            throw new IOException("/proc/mounts not found");
        }
        String expected = mountPoint.trim();
        List<String> lines = Files.readAllLines(procMounts);
        return lines.stream()
                .map(l -> l.split("\\s+"))
                .filter(a -> a.length >= 2)
                .anyMatch(a -> a[1].equals(expected));
    }
}
@@ -0,0 +1,100 @@
package com.modelengine.edatamate.plugin.writer.nfswriter;

import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.exception.CommonErrorCode;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordReceiver;
import com.alibaba.datax.common.spi.Writer;
import com.alibaba.datax.common.util.Configuration;

import org.apache.commons.lang3.StringUtils;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.UUID;

public class NfsWriter extends Writer {
    public static class Job extends Writer.Job {
        private Configuration jobConfig;
        private String mountPoint;

        @Override
        public void init() {
            this.jobConfig = super.getPluginJobConf();
        }

        @Override
        public void prepare() {
            this.mountPoint = "/dataset/mount/" + UUID.randomUUID();
            this.jobConfig.set("mountPoint", this.mountPoint);
            new File(this.mountPoint).mkdirs();
            MountUtil.mount(this.jobConfig.getString("ip") + ":" + this.jobConfig.getString("path"),
                    mountPoint, "nfs", StringUtils.EMPTY);
            String destPath = this.jobConfig.getString("destPath");
            new File(destPath).mkdirs();
        }

        @Override
        public List<Configuration> split(int mandatoryNumber) {
            return Collections.singletonList(this.jobConfig);
        }

        @Override
        public void post() {
            try {
                MountUtil.umount(this.mountPoint);
                new File(this.mountPoint).deleteOnExit();
            } catch (IOException | InterruptedException e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public void destroy() {
        }
    }

    public static class Task extends Writer.Task {
        private Configuration jobConfig;
        private String mountPoint;
        private String destPath;
        private List<String> files;

        @Override
        public void init() {
            this.jobConfig = super.getPluginJobConf();
            this.destPath = this.jobConfig.getString("destPath");
            this.mountPoint = this.jobConfig.getString("mountPoint");
            this.files = this.jobConfig.getList("files", Collections.emptyList(), String.class);
        }

        @Override
        public void startWrite(RecordReceiver lineReceiver) {
            try {
                Record record;
                while ((record = lineReceiver.getFromReader()) != null) {
                    String fileName = record.getColumn(0).asString();
                    if (StringUtils.isBlank(fileName)) {
                        continue;
                    }
                    if (!files.isEmpty() && !files.contains(fileName)) {
                        continue;
                    }

                    String filePath = this.mountPoint + "/" + fileName;
                    ShellUtil.runCommand("rsync", Arrays.asList("--no-links", "--chmod=750", "--", filePath,
                            this.destPath + "/" + fileName));
                }
            } catch (Exception e) {
                throw DataXException.asDataXException(CommonErrorCode.RUNTIME_ERROR, e);
            }
        }

        @Override
        public void destroy() {
        }
    }
}
@@ -0,0 +1,43 @@
package com.modelengine.edatamate.plugin.writer.nfswriter;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

public class ShellUtil {
    /**
     * Runs an external command (used here to invoke rsync).
     *
     * @param cmd       the command
     * @param extraArgs extra arguments; may be empty
     * @return full command output (stdout + stderr)
     * @throws Exception if the command returns a non-zero exit code or an IO error occurs
     */
    public static String runCommand(String cmd, List<String> extraArgs) throws Exception {
        List<String> commands = new ArrayList<>();
        commands.add(cmd);
        if (extraArgs != null && !extraArgs.isEmpty()) {
            commands.addAll(extraArgs);
        }

        ProcessBuilder pb = new ProcessBuilder(commands);
        pb.redirectErrorStream(true); // merge stdout & stderr
        Process p = pb.start();

        StringBuilder sb = new StringBuilder();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(p.getInputStream()))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append(System.lineSeparator());
            }
        }

        int exit = p.waitFor();
        if (exit != 0) {
            throw new RuntimeException(cmd + " exited with code " + exit + System.lineSeparator() + sb);
        }
        return sb.toString();
    }
}
6
runtime/datax/nfswriter/src/main/resources/plugin.json
Normal file
@@ -0,0 +1,6 @@
{
    "name": "nfswriter",
    "class": "com.modelengine.edatamate.plugin.writer.nfswriter.NfsWriter",
    "description": "write to local",
    "developer": "modelengine"
}
@@ -0,0 +1,8 @@
{
    "name": "nfswriter",
    "parameter": {
        "ip": "127.0.0.1",
        "path": "/test",
        "destPath": ""
    }
}
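Taken together, a sketch of a complete DataX job that pairs nfsreader with nfswriter might look as follows; the surrounding `job/setting/content` skeleton is the usual DataX job layout, and every address, path and `destPath` value here is an illustrative assumption, not a shipped default. The reader lists its NFS mount and emits one file name per record; the writer mounts the same share and rsyncs each named file into `destPath`, optionally restricted by a `files` whitelist.

```json
{
    "job": {
        "setting": {
            "speed": {
                "channel": 1
            }
        },
        "content": [
            {
                "reader": {
                    "name": "nfsreader",
                    "parameter": {
                        "ip": "192.168.1.10",
                        "path": "/export/dataset"
                    }
                },
                "writer": {
                    "name": "nfswriter",
                    "parameter": {
                        "ip": "192.168.1.10",
                        "path": "/export/dataset",
                        "destPath": "/data/output",
                        "files": []
                    }
                }
            }
        ]
    }
}
```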
585
runtime/datax/package.xml
Normal file
@@ -0,0 +1,585 @@
<assembly
        xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
    <id></id>
    <formats>
        <format>tar.gz</format>
        <format>dir</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>transformer/target/datax/</directory>
            <includes>
                <include>**/*.*</include>
            </includes>
            <outputDirectory>datax</outputDirectory>
        </fileSet>
        <fileSet>
            <directory>core/target/datax/</directory>
            <includes>
                <include>**/*.*</include>
            </includes>
            <outputDirectory>datax</outputDirectory>
        </fileSet>

        <!-- reader -->
        <fileSet>
            <directory>mysqlreader/target/datax/</directory>
            <includes>
                <include>**/*.*</include>
            </includes>
            <outputDirectory>datax</outputDirectory>
        </fileSet>
        <!-- <fileSet>-->
        <!--     <directory>oceanbasev10reader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>obhbasereader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>drdsreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>oraclereader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>sqlserverreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <fileSet>
            <directory>postgresqlreader/target/datax/</directory>
            <includes>
                <include>**/*.*</include>
            </includes>
            <outputDirectory>datax</outputDirectory>
        </fileSet>
        <!-- <fileSet>-->
        <!--     <directory>kingbaseesreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>rdbmsreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->

        <!-- <fileSet>-->
        <!--     <directory>odpsreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>otsreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>otsstreamreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>txtfilereader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>ossreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>mongodbreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>tdenginereader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>streamreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>ftpreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>clickhousereader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>hdfsreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>hbase11xreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>hbase094xreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>opentsdbreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>cassandrareader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>gdbreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>hbase11xsqlreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>hbase20xsqlreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>tsdbreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>datahubreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>loghubreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>starrocksreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>dorisreader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>sybasereader/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <fileSet>
            <directory>gaussdbreader/target/datax/</directory>
            <includes>
                <include>**/*.*</include>
            </includes>
            <outputDirectory>datax</outputDirectory>
        </fileSet>
        <fileSet>
            <directory>nfsreader/target/datax/</directory>
            <includes>
                <include>**/*.*</include>
            </includes>
            <outputDirectory>datax</outputDirectory>
        </fileSet>

        <!-- writer -->
        <fileSet>
            <directory>mysqlwriter/target/datax/</directory>
            <includes>
                <include>**/*.*</include>
            </includes>
            <outputDirectory>datax</outputDirectory>
        </fileSet>
        <!-- <fileSet>-->
        <!--     <directory>tdenginewriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>starrockswriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>drdswriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>odpswriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>doriswriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>txtfilewriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>ftpwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>osswriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>adswriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>streamwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>otswriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>mongodbwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>oraclewriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>sqlserverwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <fileSet>
            <directory>postgresqlwriter/target/datax/</directory>
            <includes>
                <include>**/*.*</include>
            </includes>
            <outputDirectory>datax</outputDirectory>
        </fileSet>
        <!-- <fileSet>-->
        <!--     <directory>kingbaseeswriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>rdbmswriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>ocswriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>hdfswriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>hbase11xwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>hbase094xwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>hbase11xsqlwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>elasticsearchwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>hbase20xsqlwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>tsdbwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>adbpgwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>cassandrawriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>clickhousewriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>databendwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>oscarwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>oceanbasev10writer/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>obhbasewriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>gdbwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>kuduwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>hologresjdbcwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>datahubwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>loghubwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>selectdbwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>neo4jwriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <!-- <fileSet>-->
        <!--     <directory>sybasewriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <fileSet>
            <directory>gaussdbwriter/target/datax/</directory>
            <includes>
                <include>**/*.*</include>
            </includes>
            <outputDirectory>datax</outputDirectory>
        </fileSet>
        <!-- <fileSet>-->
        <!--     <directory>milvuswriter/target/datax/</directory>-->
        <!--     <includes>-->
        <!--         <include>**/*.*</include>-->
        <!--     </includes>-->
        <!--     <outputDirectory>datax</outputDirectory>-->
        <!-- </fileSet>-->
        <fileSet>
            <directory>nfswriter/target/datax/</directory>
            <includes>
                <include>**/*.*</include>
            </includes>
            <outputDirectory>datax</outputDirectory>
        </fileSet>
    </fileSets>
</assembly>
308
runtime/datax/pom.xml
Normal file
@@ -0,0 +1,308 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.alibaba.datax</groupId>
    <artifactId>datax-all</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <dependencies>
        <dependency>
            <groupId>org.hamcrest</groupId>
            <artifactId>hamcrest-core</artifactId>
            <version>1.3</version>
        </dependency>
    </dependencies>

    <name>datax-all</name>
    <packaging>pom</packaging>

    <properties>
        <jdk-version>1.8</jdk-version>
        <datax-project-version>0.0.1-SNAPSHOT</datax-project-version>
        <commons-lang3-version>3.3.2</commons-lang3-version>
        <commons-configuration-version>1.10</commons-configuration-version>
        <commons-cli-version>1.2</commons-cli-version>
        <fastjson-version>2.0.23</fastjson-version>
        <guava-version>16.0.1</guava-version>
        <diamond.version>3.7.2.1-SNAPSHOT</diamond.version>

        <!-- slf4j 1.7.10 and logback-classic 1.0.13 go together -->
        <slf4j-api-version>1.7.10</slf4j-api-version>
        <logback-classic-version>1.0.13</logback-classic-version>
        <commons-io-version>2.4</commons-io-version>
        <junit-version>4.13.1</junit-version>
        <tddl.version>5.1.22-1</tddl.version>
        <swift-version>1.0.0</swift-version>

        <project-sourceEncoding>UTF-8</project-sourceEncoding>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <maven.compiler.encoding>UTF-8</maven.compiler.encoding>
        <mysql.driver.version>8.0.33</mysql.driver.version>
    </properties>

    <modules>
        <module>common</module>
        <module>core</module>
        <module>transformer</module>

        <!-- reader -->
        <module>mysqlreader</module>
        <!-- <module>drdsreader</module>-->
        <!-- <module>sqlserverreader</module>-->
        <module>postgresqlreader</module>
        <!-- <module>kingbaseesreader</module>-->
        <!-- <module>oraclereader</module>-->
        <!-- <module>cassandrareader</module>-->
        <!-- <module>oceanbasev10reader</module>-->
        <!-- <module>obhbasereader</module>-->
        <!-- <module>rdbmsreader</module>-->

        <!-- <module>odpsreader</module>-->
        <!-- <module>otsreader</module>-->
        <!-- <module>otsstreamreader</module>-->
        <!-- <module>hbase11xreader</module>-->
        <!-- <module>hbase094xreader</module>-->
        <!-- <module>hbase11xsqlreader</module>-->
        <!-- <module>hbase20xsqlreader</module>-->

        <!-- <module>ossreader</module>-->
        <!-- <module>hdfsreader</module>-->
        <!-- <module>ftpreader</module>-->
        <!-- <module>txtfilereader</module>-->
        <!-- <module>streamreader</module>-->
        <!-- <module>clickhousereader</module>-->

        <!-- <module>mongodbreader</module>-->
        <!-- <module>tdenginereader</module>-->
        <!-- <module>gdbreader</module>-->
        <!-- <module>tsdbreader</module>-->
        <!-- <module>opentsdbreader</module>-->
        <!-- <module>loghubreader</module>-->
        <!-- <module>datahubreader</module>-->
        <!-- <module>starrocksreader</module>-->
        <!-- <module>sybasereader</module>-->
        <!-- <module>dorisreader</module>-->
        <module>nfsreader</module>
        <!-- writer -->
        <module>mysqlwriter</module>
        <!-- <module>starrockswriter</module>-->
        <!-- <module>drdswriter</module>-->
        <!-- <module>databendwriter</module>-->
        <!-- <module>oraclewriter</module>-->
        <!-- <module>sqlserverwriter</module>-->
        <module>postgresqlwriter</module>
        <!-- <module>kingbaseeswriter</module>-->
        <!-- <module>adswriter</module>-->
        <!-- <module>oceanbasev10writer</module>-->
        <!-- <module>obhbasewriter</module>-->
        <!-- <module>adbpgwriter</module>-->
        <!-- <module>hologresjdbcwriter</module>-->
        <!-- <module>rdbmswriter</module>-->


        <!-- <module>odpswriter</module>-->
        <!-- <module>osswriter</module>-->
        <!-- <module>otswriter</module>-->
        <!-- <module>hbase11xwriter</module>-->
        <!-- <module>hbase094xwriter</module>-->
        <!-- <module>hbase11xsqlwriter</module>-->
        <!-- <module>hbase20xsqlwriter</module>-->
        <!-- <module>kuduwriter</module>-->
        <!-- <module>ftpwriter</module>-->
        <!-- <module>hdfswriter</module>-->
        <!-- <module>txtfilewriter</module>-->
        <!-- <module>streamwriter</module>-->

        <!-- <module>elasticsearchwriter</module>-->
        <!-- <module>mongodbwriter</module>-->
        <!-- <module>tdenginewriter</module>-->
        <!-- <module>ocswriter</module>-->
        <!-- <module>tsdbwriter</module>-->
        <!-- <module>gdbwriter</module>-->
        <!-- <module>oscarwriter</module>-->
        <!-- <module>loghubwriter</module>-->
        <!-- <module>datahubwriter</module>-->
        <!-- <module>cassandrawriter</module>-->
        <!-- <module>clickhousewriter</module>-->
        <!-- <module>doriswriter</module>-->
        <!-- <module>selectdbwriter</module>-->
        <!-- <module>adbmysqlwriter</module>-->
        <!-- <module>sybasewriter</module>-->
        <!-- <module>neo4jwriter</module>-->
        <!-- <module>milvuswriter</module>-->
        <module>nfswriter</module>
        <!-- common support module -->
        <module>plugin-rdbms-util</module>
        <module>plugin-unstructured-storage-util</module>
        <module>gaussdbreader</module>
        <module>gaussdbwriter</module>
        <!-- <module>datax-example</module>-->

    </modules>

    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.apache.commons</groupId>
                <artifactId>commons-lang3</artifactId>
                <version>${commons-lang3-version}</version>
            </dependency>
            <dependency>
                <groupId>com.alibaba.fastjson2</groupId>
                <artifactId>fastjson2</artifactId>
                <version>${fastjson-version}</version>
            </dependency>
            <!--<dependency>
                <groupId>com.google.guava</groupId>
                <artifactId>guava</artifactId>
                <version>${guava-version}</version>
            </dependency>-->
            <dependency>
                <groupId>commons-io</groupId>
                <artifactId>commons-io</artifactId>
                <version>${commons-io-version}</version>
            </dependency>
            <dependency>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-api</artifactId>
                <version>${slf4j-api-version}</version>
            </dependency>
            <dependency>
                <groupId>ch.qos.logback</groupId>
                <artifactId>logback-classic</artifactId>
                <version>${logback-classic-version}</version>
            </dependency>

            <dependency>
                <groupId>com.taobao.tddl</groupId>
                <artifactId>tddl-client</artifactId>
                <version>${tddl.version}</version>
                <exclusions>
                    <exclusion>
                        <groupId>com.google.guava</groupId>
                        <artifactId>guava</artifactId>
                    </exclusion>
                    <exclusion>
                        <groupId>com.taobao.diamond</groupId>
                        <artifactId>diamond-client</artifactId>
                    </exclusion>
                </exclusions>
            </dependency>

            <dependency>
                <groupId>com.taobao.diamond</groupId>
                <artifactId>diamond-client</artifactId>
                <version>${diamond.version}</version>
            </dependency>

            <dependency>
                <groupId>com.alibaba.search.swift</groupId>
                <artifactId>swift_client</artifactId>
                <version>${swift-version}</version>
            </dependency>

            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>${junit-version}</version>
            </dependency>

            <dependency>
                <groupId>org.mockito</groupId>
                <artifactId>mockito-all</artifactId>
                <version>1.9.5</version>
                <scope>test</scope>
            </dependency>
            <dependency>
                <groupId>org.apache.logging.log4j</groupId>
                <artifactId>log4j-api</artifactId>
                <version>2.17.1</version>
            </dependency>

            <dependency>
                <groupId>org.apache.logging.log4j</groupId>
                <artifactId>log4j-core</artifactId>
                <version>2.17.1</version>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <repositories>
        <repository>
            <id>central</id>
            <name>Nexus aliyun</name>
            <url>https://maven.aliyun.com/repository/central</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
        <repository>
            <id>spring</id>
            <name>spring</name>
            <url>https://maven.aliyun.com/repository/spring</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
    </repositories>

    <pluginRepositories>
        <pluginRepository>
            <id>central</id>
            <name>Nexus aliyun</name>
            <url>https://maven.aliyun.com/repository/central</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </pluginRepository>
    </pluginRepositories>

    <build>
        <resources>
            <resource>
                <directory>src/main/java</directory>
                <includes>
                    <include>**/*.properties</include>
                </includes>
            </resource>
        </resources>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.2-beta-5</version>
                <configuration>
                    <finalName>datax</finalName>
                    <descriptors>
                        <descriptor>package.xml</descriptor>
                    </descriptors>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>${jdk-version}</source>
                    <target>${jdk-version}</target>
                    <encoding>${project-sourceEncoding}</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
89
runtime/ops/README.md
Normal file
@@ -0,0 +1,89 @@
|
||||
# Custom Operator Development Guide
|
||||
|
||||
## Operator Specification
|
||||
|
||||
### Operator Metadata Format
|
||||
|
||||
Every custom operator must include a `metadata.yml` file:
|
||||
|
||||
```yaml
|
||||
name: '落盘算子'
|
||||
name_en: 'save file operator'
|
||||
description: '将文件内容保存为文件。'
|
||||
description_en: 'Save the file data as a file.'
|
||||
language: 'Python'
|
||||
vendor: 'Huawei'
|
||||
raw_id: 'FileExporter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'others'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'all'
|
||||
outputs: 'all'
|
||||
```
|
||||
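The field set shown above is inferred from this example rather than from a published schema; a minimal, hedged sanity check for a `metadata.yml` might look like this (the required-key list is an assumption based on the sample):

```python
# Sketch: sanity-check an operator's metadata.yml before packaging it.
# REQUIRED_KEYS is inferred from the example above, not from a formal schema.
import yaml  # PyYAML

REQUIRED_KEYS = {"name", "description", "language", "vendor", "raw_id",
                 "version", "types", "modal", "inputs", "outputs"}

with open("metadata.yml", "r", encoding="utf-8") as f:
    meta = yaml.safe_load(f)

missing = REQUIRED_KEYS - meta.keys()
if missing:
    raise ValueError(f"metadata.yml is missing keys: {sorted(missing)}")
print(f"operator {meta['raw_id']} v{meta['version']} looks well-formed")
```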
|
||||
### Operator Implementation
|
||||
|
||||
Create a `process.py` file:
|
||||
|
||||
```python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: JSON text extraction
|
||||
Create: 2024/06/06 15:43
|
||||
"""
|
||||
import time
|
||||
from loguru import logger
|
||||
from typing import Dict, Any
|
||||
|
||||
from datamate.core.base_op import Mapper
|
||||
|
||||
|
||||
class TextFormatter(Mapper):
|
||||
"""把输入的json文件流抽取为txt"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(TextFormatter, self).__init__(*args, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def _extract_json(byte_io):
|
||||
"""将默认使用utf-8编码的Json文件流解码,抽取为txt"""
|
||||
# Decoding with utf-8-sig avoids leaving a hidden BOM prefix when the source file is UTF-8 with BOM.
|
||||
return byte_io.decode("utf-8-sig").replace("\r\n", "\n")
|
||||
|
||||
def byte_read(self, sample: Dict[str, Any]):
|
||||
filepath = sample[self.filepath_key]
|
||||
with open(filepath, "rb") as file:
|
||||
byte_data = file.read()
|
||||
sample[self.data_key] = byte_data
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
try:
|
||||
self.byte_read(sample)
|
||||
sample[self.text_key] = self._extract_json(sample[self.data_key])
|
||||
sample[self.data_key] = b"" # 将sample[self.data_key]置空
|
||||
logger.info(
|
||||
f"fileName: {sample[self.filename_key]}, method: TextFormatter costs {(time.time() - start):6f} s")
|
||||
except UnicodeDecodeError as err:
|
||||
logger.exception(f"fileName: {sample[self.filename_key]}, method: TextFormatter causes decode error: {err}")
|
||||
raise
|
||||
return sample
|
||||
|
||||
```
|
||||
|
||||
Create an `__init__.py` file:
|
||||
|
||||
```python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='TextFormatter',
|
||||
module_path="ops.formatter.text_formatter.process")
|
||||
|
||||
```
|
||||
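For reference, `register_module` follows a lazy-import registry pattern: the operator's module is only imported when the operator is first requested by name. The real `OPERATORS` registry is project-internal, so the sketch below illustrates the pattern rather than its actual API:

```python
# Illustrative lazy-import registry; not the actual OPERATORS implementation.
import importlib


class LazyRegistry:
    """Maps an operator name to the module path that defines it."""

    def __init__(self):
        self._modules = {}

    def register_module(self, module_name: str, module_path: str):
        # Only record where the operator lives; nothing is imported yet.
        self._modules[module_name] = module_path

    def build(self, module_name: str, **kwargs):
        # Import the module on first use and instantiate the class of the same name.
        module = importlib.import_module(self._modules[module_name])
        return getattr(module, module_name)(**kwargs)


OPERATORS = LazyRegistry()
OPERATORS.register_module(module_name='TextFormatter',
                          module_path='ops.formatter.text_formatter.process')
# formatter = OPERATORS.build('TextFormatter')  # would import the module and instantiate the class
```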
49
runtime/ops/examples/text_length_filter/metadata.json
Normal file
@@ -0,0 +1,49 @@
|
||||
{
|
||||
"name": "text_length_filter",
|
||||
"displayName": "文本长度过滤器",
|
||||
"version": "1.0.0",
|
||||
"author": "DataMate Team",
|
||||
"description": "根据文本长度过滤数据,支持最小和最大长度限制",
|
||||
"category": "数据清洗",
|
||||
"type": "CUSTOM",
|
||||
"inputs": [
|
||||
{
|
||||
"name": "input_data",
|
||||
"type": "array",
|
||||
"description": "输入文本数组",
|
||||
"required": true
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "filtered_data",
|
||||
"type": "array",
|
||||
"description": "过滤后的文本数组"
|
||||
}
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "min_length",
|
||||
"type": "integer",
|
||||
"description": "最小文本长度",
|
||||
"default": 10,
|
||||
"min": 0
|
||||
},
|
||||
{
|
||||
"name": "max_length",
|
||||
"type": "integer",
|
||||
"description": "最大文本长度",
|
||||
"default": 1000,
|
||||
"min": 1
|
||||
},
|
||||
{
|
||||
"name": "text_field",
|
||||
"type": "string",
|
||||
"description": "文本字段名称(如果输入是对象数组)",
|
||||
"default": "text"
|
||||
}
|
||||
],
|
||||
"tags": ["文本处理", "数据过滤", "长度检查"],
|
||||
"documentation": "https://docs.datamate.com/operators/text-length-filter",
|
||||
"repository": "https://github.com/datamate/operators/tree/main/text-length-filter"
|
||||
}
|
||||
135
runtime/ops/examples/text_length_filter/operator.py
Normal file
@@ -0,0 +1,135 @@
|
||||
"""
|
||||
文本长度过滤器算子
|
||||
根据设定的最小和最大长度过滤文本数据
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Any, List, Union
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class TextLengthFilter:
|
||||
"""文本长度过滤器算子"""
|
||||
|
||||
def __init__(self):
|
||||
self.name = "text_length_filter"
|
||||
self.version = "1.0.0"
|
||||
|
||||
def execute(self, config: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行文本长度过滤"""
|
||||
|
||||
logger.info(f"开始执行算子: {self.name}")
|
||||
|
||||
# 获取参数
|
||||
parameters = config.get('parameters', {})
|
||||
min_length = parameters.get('min_length', 10)
|
||||
max_length = parameters.get('max_length', 1000)
|
||||
text_field = parameters.get('text_field', 'text')
|
||||
|
||||
logger.info(f"过滤参数: min_length={min_length}, max_length={max_length}, text_field={text_field}")
|
||||
|
||||
# 验证参数
|
||||
if min_length < 0:
|
||||
raise ValueError("min_length must be >= 0")
|
||||
if max_length < min_length:
|
||||
raise ValueError("max_length must be >= min_length")
|
||||
|
||||
# 读取输入数据
|
||||
input_path = context['input_path']
|
||||
with open(input_path, 'r', encoding='utf-8') as f:
|
||||
input_data = json.load(f)
|
||||
|
||||
if not isinstance(input_data, list):
|
||||
raise ValueError("输入数据必须是数组格式")
|
||||
|
||||
logger.info(f"输入数据条数: {len(input_data)}")
|
||||
|
||||
# 执行过滤
|
||||
filtered_data = []
|
||||
stats = {
|
||||
'total_input': len(input_data),
|
||||
'too_short': 0,
|
||||
'too_long': 0,
|
||||
'filtered_out': 0,
|
||||
'kept': 0
|
||||
}
|
||||
|
||||
for i, item in enumerate(input_data):
|
||||
try:
|
||||
# 提取文本内容
|
||||
if isinstance(item, str):
|
||||
text = item
|
||||
elif isinstance(item, dict) and text_field in item:
|
||||
text = str(item[text_field])
|
||||
else:
|
||||
logger.warning(f"跳过无法处理的数据项 {i}: {type(item)}")
|
||||
stats['filtered_out'] += 1
|
||||
continue
|
||||
|
||||
# 检查长度
|
||||
text_length = len(text)
|
||||
|
||||
if text_length < min_length:
|
||||
stats['too_short'] += 1
|
||||
stats['filtered_out'] += 1
|
||||
elif text_length > max_length:
|
||||
stats['too_long'] += 1
|
||||
stats['filtered_out'] += 1
|
||||
else:
|
||||
filtered_data.append(item)
|
||||
stats['kept'] += 1
|
||||
|
||||
# 进度报告
|
||||
if (i + 1) % 1000 == 0:
|
||||
progress = (i + 1) / len(input_data) * 100
|
||||
logger.info(f"处理进度: {progress:.1f}% ({i + 1}/{len(input_data)})")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"处理数据项 {i} 时出错: {e}")
|
||||
stats['filtered_out'] += 1
|
||||
continue
|
||||
|
||||
# 保存结果
|
||||
output_path = context['output_path']
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(filtered_data, f, ensure_ascii=False, indent=2)
|
||||
|
||||
# 准备返回结果
|
||||
result = {
|
||||
'status': 'success',
|
||||
'statistics': stats,
|
||||
'filter_rate': stats['filtered_out'] / stats['total_input'] * 100 if stats['total_input'] > 0 else 0,
|
||||
'output_path': output_path
|
||||
}
|
||||
|
||||
logger.info(f"过滤完成: {stats}")
|
||||
logger.info(f"过滤率: {result['filter_rate']:.2f}%")
|
||||
|
||||
return result
|
||||
|
||||
def validate_config(self, config: Dict[str, Any]) -> List[str]:
|
||||
"""验证配置参数"""
|
||||
errors = []
|
||||
parameters = config.get('parameters', {})
|
||||
|
||||
min_length = parameters.get('min_length')
|
||||
max_length = parameters.get('max_length')
|
||||
|
||||
if min_length is not None and not isinstance(min_length, int):
|
||||
errors.append("min_length must be an integer")
|
||||
|
||||
if max_length is not None and not isinstance(max_length, int):
|
||||
errors.append("max_length must be an integer")
|
||||
|
||||
if min_length is not None and min_length < 0:
|
||||
errors.append("min_length must be >= 0")
|
||||
|
||||
if min_length is not None and max_length is not None and max_length < min_length:
|
||||
errors.append("max_length must be >= min_length")
|
||||
|
||||
return errors
|
||||
|
||||
def create_operator():
|
||||
"""算子工厂函数"""
|
||||
return TextLengthFilter()
|
||||
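For context, a local run of the example operator above might look like the following sketch; the input and output file names are hypothetical, and the input file is assumed to hold a JSON array of strings or of objects with a `text` field:

```python
# Hypothetical local invocation of the text_length_filter example operator.
import importlib.util

# Load the example module by path to avoid clashing with the stdlib "operator" module.
spec = importlib.util.spec_from_file_location(
    "text_length_filter_operator",
    "runtime/ops/examples/text_length_filter/operator.py")
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)

op = mod.create_operator()
config = {"parameters": {"min_length": 5, "max_length": 200, "text_field": "text"}}
# samples.json is assumed to hold e.g. ["short", {"text": "a sufficiently long sentence"}, ...]
context = {"input_path": "samples.json", "output_path": "filtered.json"}

errors = op.validate_config(config)
if errors:
    raise ValueError(errors)

result = op.execute(config, context)
print(result["statistics"])
```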
29
runtime/ops/filter/__init__.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datamate.common.utils.custom_importer import CustomImporter
|
||||
|
||||
|
||||
def _configure_importer():
|
||||
base_path = Path(__file__).resolve().parent
|
||||
sys.meta_path.append(CustomImporter(base_path))
|
||||
|
||||
|
||||
_configure_importer()
|
||||
|
||||
|
||||
def _import_operators():
|
||||
from . import file_with_high_repeat_phrase_rate_filter
|
||||
from . import file_with_high_repeat_word_rate_filter
|
||||
from . import file_with_high_special_char_rate_filter
|
||||
from . import remove_file_with_many_sensitive_words
|
||||
from . import remove_file_with_short_or_long_length
|
||||
from . import remove_duplicate_file
|
||||
from . import img_blurred_images_cleaner
|
||||
from . import img_duplicated_images_cleaner
|
||||
from . import img_similar_images_cleaner
|
||||
from . import img_advertisement_images_cleaner
|
||||
|
||||
|
||||
_import_operators()
|
||||
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='FileWithHighRepeatPhraseRateFilter',
|
||||
module_path="ops.filter.file_with_high_repeat_phrase_rate_filter.process")
|
||||
@@ -0,0 +1,31 @@
|
||||
name: '文档词重复率检查'
|
||||
description: '去除重复词过多的文档。'
|
||||
language: 'Python'
|
||||
vendor: 'Huawei'
|
||||
raw_id: 'FileWithHighRepeatPhraseRateFilter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'cleanse'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: '机器机器机器机器机器机器机器机器机器机器学习学习学习学习学习'
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
settings:
|
||||
repeatPhraseRatio:
|
||||
name: 文档词重复率
|
||||
description: 某个词的统计数/文档总词数 > 设定值,该文档被去除。
|
||||
type: slider
|
||||
defaultVal: 0.5
|
||||
min: 0
|
||||
max: 1
|
||||
step: 0.1
|
||||
hitStopwords:
|
||||
name: 去除停用词
|
||||
description: 统计重复词时,选择是否要去除停用词。
|
||||
type: switch
|
||||
defaultVal: false
|
||||
required: true
|
||||
checkedLabel: 去除
|
||||
unCheckedLabel: 不去除
|
||||
@@ -0,0 +1,73 @@
|
||||
#!/usr/bin/python
|
||||
# -- encoding: utf-8 --
|
||||
|
||||
"""
|
||||
Description: 词重复率过高文档过滤插件
|
||||
Create: 2023/11/7 9:26
|
||||
"""
|
||||
import re
|
||||
import time
|
||||
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from loguru import logger
|
||||
|
||||
import jieba
|
||||
from datamate.core.base_op import Filter
|
||||
|
||||
|
||||
class FileWithHighRepeatPhraseRateFilter(Filter):
|
||||
"""词重复率过高文档过滤插件"""
|
||||
PUNCTUATION_PATTERN = re.compile(r'^[\u3000-\u303F\uff00-\uffef\s\W_]+$')
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(FileWithHighRepeatPhraseRateFilter, self).__init__(*args, **kwargs)
|
||||
self._min_threshold = kwargs.get("repeatPhraseRatio", 0.5) # 重复词符占全文的比例阈值,默认值为0.5
|
||||
self._hit_stopword_trigger = kwargs.get("hitStopwords", False) # 计算重复词率时是否去除停用词,默认为False不去除,True为去除
|
||||
self._file_path = Path(__file__).parent / 'resources' / 'hit_stopwords.txt'
|
||||
self._hit_stopwords = []
|
||||
if self._hit_stopword_trigger:
|
||||
with open(self._file_path, 'r', encoding='utf-8') as f:
|
||||
self._hit_stopwords = f.read().splitlines()
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
sample[self.text_key] = self._file_with_high_repeat_phrase_rate_filter(sample[self.text_key],
|
||||
sample[self.filename_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
f"method: FileWithHighRepeatPhraseRateFilter costs {(time.time() - start):6f} s")
|
||||
return sample
|
||||
|
||||
def _tokenize_by_jieba(self, text: str):
|
||||
"""基于jieba对输入文本进行切分
|
||||
|
||||
Args:
|
||||
text: 输入文档内容
|
||||
Returns:
|
||||
words_list: 切割后的词列表
|
||||
"""
|
||||
|
||||
for word in jieba.lcut(text):
|
||||
if not self.PUNCTUATION_PATTERN.match(word) and word not in self._hit_stopwords:
|
||||
yield word
|
||||
|
||||
def _file_with_high_repeat_phrase_rate_filter(self, input_data: str, file_name):
|
||||
if len(input_data) < 2:  # a phrase needs at least two characters; shorter input is returned unchanged
|
||||
return input_data
|
||||
words_list = self._tokenize_by_jieba(input_data)
|
||||
words_count = dict(Counter(words_list))
|
||||
words_count_max, words_total_count = 0, 0
|
||||
for words in words_count:
|
||||
# 只统计中文、字母,且长度大于1的词语
|
||||
if len(words) > 1 and words.isalpha():
|
||||
words_count_max = max(words_count_max, words_count.get(words))
|
||||
words_total_count += words_count.get(words)
|
||||
output_data = input_data
|
||||
repeat_phrase_rate = words_count_max / words_total_count if words_total_count > 0 else 0
|
||||
if repeat_phrase_rate >= self._min_threshold:
|
||||
# 只要有一个词重复率高于阈值,就会过滤文档
|
||||
output_data = ""
|
||||
logger.info(f"The repeat phrase rate of the input data is {repeat_phrase_rate}. "
|
||||
f"Threshold is {self._min_threshold}. The document {file_name} is filtered.")
|
||||
return output_data
|
||||
File diff suppressed because it is too large
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='FileWithHighRepeatWordRateFilter',
|
||||
module_path="ops.filter.file_with_high_repeat_word_rate_filter.process")
|
||||
@@ -0,0 +1,25 @@
|
||||
name: '文档字重复率检查'
|
||||
name_en: 'Word Repetition Rate Check'
|
||||
description: '去除重复字过多的文档。'
|
||||
description_en: 'Filters out files that contain excessive repeated words.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'FileWithHighRepeatWordRateFilter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'cleanse'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: '机器学学学学学学学学学学学学学学学学学学学学学学学学学学学学学学习'
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
settings:
|
||||
repeatWordRatio:
|
||||
name: 文档字重复率
|
||||
description: 某个字的统计数/文档总字数 > 设定值,该文档被去除。
|
||||
type: slider
|
||||
defaultVal: 0.5
|
||||
min: 0
|
||||
max: 1
|
||||
step: 0.1
|
||||
@@ -0,0 +1,51 @@
|
||||
#!/usr/bin/python
|
||||
# -- encoding: utf-8 --
|
||||
|
||||
"""
|
||||
Description: 检查文档字重复率插件
|
||||
Create: 2023/11/7 9:26
|
||||
"""
|
||||
import re
|
||||
import time
|
||||
|
||||
from collections import Counter
|
||||
from typing import Dict, Any
|
||||
from loguru import logger
|
||||
|
||||
from datamate.core.base_op import Filter
|
||||
|
||||
|
||||
class FileWithHighRepeatWordRateFilter(Filter):
|
||||
"""检查文档字重复率插件"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(FileWithHighRepeatWordRateFilter, self).__init__(*args, **kwargs)
|
||||
self._min_threshold = kwargs.get("repeatWordRatio", 0.5) # 重复字符占整行的比例阈值,默认值为0.5
|
||||
|
||||
@staticmethod
|
||||
def _extract_word(input_data):
|
||||
# 只统计中文字的重复率
|
||||
extracted_word = re.sub(r'[^\u4e00-\u9fff]', '', input_data)
|
||||
return extracted_word
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
sample[self.text_key] = self._file_with_high_repeat_word_rate_filter(sample[self.text_key],
|
||||
sample[self.filename_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
f"method: FileWithHighRepeatWordRateFilter costs {(time.time() - start):6f} s")
|
||||
return sample
|
||||
|
||||
def _file_with_high_repeat_word_rate_filter(self, input_data: str, file_name):
|
||||
tmp = self._extract_word(input_data)
|
||||
if not tmp:
|
||||
return input_data
|
||||
output_data = input_data
|
||||
words_count = Counter(tmp)
|
||||
max_value = max(words_count.values())
|
||||
repeat_word_rate = max_value / len(tmp)
|
||||
if repeat_word_rate >= self._min_threshold:
|
||||
output_data = ""
|
||||
logger.info(f"The repeat word rate of the input data is {repeat_word_rate}. "
|
||||
f"Threshold is {self._min_threshold}. The document %s is filtered.")
|
||||
return output_data
|
||||
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='FileWithHighSpecialCharRateFilter',
|
||||
module_path="ops.filter.file_with_high_special_char_rate_filter.process")
|
||||
@@ -0,0 +1,25 @@
|
||||
name: '文档特殊字符率检查'
|
||||
name_en: 'Special Character Rate Check'
|
||||
description: '去除特殊字符过多的文档。'
|
||||
description_en: 'Filters out files that contain excessive special characters.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'FileWithHighSpecialCharRateFilter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'cleanse'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: '你好!@!@#!¥!@#'
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
settings:
|
||||
specialCharRatio:
|
||||
name: 文档特殊字符率
|
||||
description: 特殊字符的统计数/文档总字数 > 设定值,该文档被去除。
|
||||
type: slider
|
||||
defaultVal: 0.3
|
||||
min: 0
|
||||
max: 1
|
||||
step: 0.1
|
||||
@@ -0,0 +1,49 @@
|
||||
#!/usr/bin/python
|
||||
# -- encoding: utf-8 --
|
||||
|
||||
"""
|
||||
Description: 文档特殊字符率检查
|
||||
Create: 2023/11/7 9:26
|
||||
"""
|
||||
import time
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from loguru import logger
|
||||
|
||||
from datamate.core.base_op import Filter
|
||||
|
||||
|
||||
class FileWithHighSpecialCharRateFilter(Filter):
|
||||
"""检查文档特殊字符率"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(FileWithHighSpecialCharRateFilter, self).__init__(*args, **kwargs)
|
||||
self._min_threshold = kwargs.get("specialCharRatio", 0.3) # 特殊字符占全文比例阈值,默认值为0.3
|
||||
self._file_path = Path(__file__).parent / 'resources' / 'special_token.txt'
|
||||
with open(self._file_path, 'r', encoding='utf-8') as f:
|
||||
self._special_token = set(f.read().splitlines())
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
sample[self.text_key] = self._file_with_high_special_char_rate_filter(sample[self.text_key],
|
||||
sample[self.filename_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
f"method: FileWithHighSpecialCharRateFilter costs {(time.time() - start):6f} s")
|
||||
return sample
|
||||
|
||||
def _file_with_high_special_char_rate_filter(self, input_data: str, file_name):
|
||||
if not input_data:
|
||||
return ""
|
||||
|
||||
output_data = input_data
|
||||
total = 0
|
||||
for token in self._special_token:
|
||||
total += input_data.count(token)
|
||||
|
||||
special_char_rate = total / len(input_data)
|
||||
if special_char_rate >= self._min_threshold:
|
||||
logger.info(f"The special char rate of the input data is {special_char_rate}. "
|
||||
f"Threshold is {self._min_threshold}. The document {file_name} is filtered.")
|
||||
output_data = ""
|
||||
return output_data
|
||||
@@ -0,0 +1,50 @@
|
||||
~
|
||||
·
|
||||
!
|
||||
@
|
||||
#
|
||||
¥
|
||||
%
|
||||
…
|
||||
&
|
||||
*
|
||||
(
|
||||
)
|
||||
—
|
||||
+
|
||||
-
|
||||
=
|
||||
{
|
||||
}
|
||||
|
|
||||
【
|
||||
】
|
||||
、
|
||||
:
|
||||
“
|
||||
;
|
||||
‘
|
||||
《
|
||||
》
|
||||
?
|
||||
,
|
||||
。
|
||||
`
|
||||
!
|
||||
$
|
||||
^
|
||||
(
|
||||
)
|
||||
_
|
||||
[
|
||||
]
|
||||
\
|
||||
:
|
||||
"
|
||||
;
|
||||
'
|
||||
<
|
||||
>
|
||||
?
|
||||
,
|
||||
/
|
||||
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='ImgAdvertisementImagesCleaner',
|
||||
module_path="ops.filter.img_advertisement_images_cleaner.process")
|
||||
@@ -0,0 +1,16 @@
|
||||
name: '广告图片过滤'
|
||||
name_en: 'Ad Image Filter'
|
||||
description: '去除包含二维码的图片。'
|
||||
description_en: 'Removes images containing QR codes.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'ImgAdvertisementImagesCleaner'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'cleanse'
|
||||
modal: 'image'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'image'
|
||||
outputs: 'image'
|
||||
127
runtime/ops/filter/img_advertisement_images_cleaner/process.py
Normal file
@@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
|
||||
"""
|
||||
Description:
|
||||
Create: 2024/1/22 20:49
|
||||
"""
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
from datamate.common.utils import bytes_transform
|
||||
from datamate.core.base_op import Filter
|
||||
|
||||
from .wechat_qrcode_model import WechatQRCodeModel
|
||||
|
||||
|
||||
class ImgAdvertisementImagesCleaner(Filter):
|
||||
"""去除广告图片的插件,当前仅支持去除二维码"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(ImgAdvertisementImagesCleaner, self).__init__(*args, **kwargs)
|
||||
self.img_resize = 1000 # 大图片的最长边压缩为1000
|
||||
self.use_model = True
|
||||
self.model = self.get_model(*args, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def _detect_qr_code_using_anchor_point(img):
|
||||
# 有些二维码和边缘紧贴,无法识别出整个矩形,所以我们先对图片大小进行扩展
|
||||
expand_length = 10
|
||||
edge = expand_length // 2
|
||||
h, w = img.shape[:2]
|
||||
image_extend = np.zeros((img.shape[0] + expand_length, img.shape[1] + expand_length, 3), np.uint8)
|
||||
image_extend[:] = 255
|
||||
image_extend[edge:edge + h, edge:edge + w] = img
|
||||
|
||||
# 转灰度、二值化、找轮廓
|
||||
gray = cv2.cvtColor(image_extend, cv2.COLOR_BGR2GRAY)
|
||||
# 中值滤波
|
||||
blur_image = cv2.medianBlur(gray, 5)
|
||||
_, thresh = cv2.threshold(blur_image, 127, 255, cv2.THRESH_BINARY)
|
||||
contours, hir = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
|
||||
|
||||
# 三个“回”字特征轮廓存储
|
||||
parent_contours_list = []
|
||||
hir_list = hir[0]
|
||||
for i, item in enumerate(hir_list):
|
||||
# 判断A轮廓是否有B轮廓
|
||||
if item[2] == -1:
|
||||
continue
|
||||
else:
|
||||
hir_b_index = item[2]
|
||||
# 判断B轮廓是否有C轮廓
|
||||
if hir_list[hir_b_index][2] == -1:
|
||||
continue
|
||||
hir_c_index = hir_list[hir_b_index][2]
|
||||
# 计算A轮廓的周长和C轮廓周长的比值
|
||||
hir_c_arc_length = cv2.arcLength(contours[hir_c_index], True)
|
||||
if hir_c_arc_length:
|
||||
error = cv2.arcLength(contours[i], True) / hir_c_arc_length
|
||||
# 二维码每一个“回”的黑白框框的比例大概为1:1:3:1:1
|
||||
# 理论上,A轮廓周长为28,C轮廓周长为12,A/C = error = 2.3333
|
||||
if 1.5 <= error <= 3:
|
||||
parent_contours_list.append(contours[i])
|
||||
|
||||
# 若找到3个以上“回”字,该图片含有二维码
|
||||
return len(parent_contours_list) >= 3
|
||||
|
||||
@staticmethod
|
||||
def _detect_qr_code_using_wechat_model(img, file_name, model):
|
||||
res = ""
|
||||
try:
|
||||
res, points = model.detectAndDecode(img)
|
||||
except UnicodeDecodeError as ex:
|
||||
res = ex.object.decode('ISO-8859-1').split(" ")[0]
|
||||
except Exception as err:
|
||||
logger.exception(f"fileName: {file_name}, method: ImgAdvertisementImagesCleaner. "
|
||||
f"An error occurred when using the WeChat model to detect the QR code. "
|
||||
f"The error is: {err}")
|
||||
if res:
|
||||
return True
|
||||
return False
|
||||
|
||||
def init_model(self, *args, **kwargs):
|
||||
return WechatQRCodeModel(*args, **kwargs).wechat_qr_model
|
||||
|
||||
def resize_img(self, image):
|
||||
"""图片等比压缩"""
|
||||
height, width = image.shape[:2] # 获取原图像的水平方向尺寸和垂直方向尺寸。
|
||||
temp = max(height, width)
|
||||
# 若图片最长边大于限值,对图片进行压缩,否则返回原图
|
||||
if temp >= self.img_resize:
|
||||
mul_temp = temp / self.img_resize
|
||||
if height > width:
|
||||
res = cv2.resize(image, (int(width / mul_temp), self.img_resize), interpolation=cv2.INTER_AREA)
|
||||
elif height < width:
|
||||
res = cv2.resize(image, (self.img_resize, int(height / mul_temp)), interpolation=cv2.INTER_AREA)
|
||||
else:
|
||||
res = cv2.resize(image, (self.img_resize, self.img_resize), interpolation=cv2.INTER_AREA)
|
||||
return res
|
||||
return image
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
img_bytes = sample[self.data_key]
|
||||
if img_bytes:
|
||||
data = bytes_transform.bytes_to_numpy(img_bytes)
|
||||
image = self._detect_advertisement_img(data, file_name, self.model)
|
||||
sample[self.data_key] = bytes_transform.numpy_to_bytes(image, file_type)
|
||||
logger.info(f"fileName: {file_name}, "
|
||||
f"method: ImgAdvertisementImagesCleaner costs {(time.time() - start):6f} s")
|
||||
return sample
|
||||
|
||||
def _detect_advertisement_img(self, img, file_name, model):
|
||||
"""检测含有二维码的图片"""
|
||||
img_resize = self.resize_img(img)
|
||||
if self._detect_qr_code_using_wechat_model(img_resize, file_name, model) \
|
||||
or self._detect_qr_code_using_anchor_point(img_resize):
|
||||
logger.info(f"fileName: {file_name}, method: ImgAdvertisementImagesCleaner. "
|
||||
"The image contains advertisement. The image is filtered out.")
|
||||
return np.array([])
|
||||
return img
|
||||
@@ -0,0 +1,23 @@
|
||||
# -- encoding: utf-8 --
|
||||
|
||||
import gc
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import cv2
|
||||
|
||||
|
||||
class WechatQRCodeModel:
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
models_path = os.getenv("MODELS_PATH", "/home/models")
|
||||
self.resources_path = str(Path(models_path, 'img_QRcode_detect', 'resources'))
|
||||
self.wechat_qr_model = cv2.wechat_qrcode_WeChatQRCode(
|
||||
str(Path(self.resources_path, 'detect.prototxt')),
|
||||
str(Path(self.resources_path, 'detect.caffemodel')),
|
||||
str(Path(self.resources_path, 'sr.prototxt')),
|
||||
str(Path(self.resources_path, 'sr.caffemodel')))
|
||||
|
||||
def __del__(self):
|
||||
del self.wechat_qr_model
|
||||
gc.collect()
|
||||
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='ImgBlurredImagesCleaner',
|
||||
module_path="ops.filter.img_blurred_images_cleaner.process")
|
||||
25
runtime/ops/filter/img_blurred_images_cleaner/metadata.yml
Normal file
@@ -0,0 +1,25 @@
|
||||
name: '模糊图片过滤'
|
||||
name_en: 'Fuzzy Image Filter'
|
||||
description: '去除模糊的图片。'
|
||||
description_en: 'Filters out fuzzy images.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'ImgBlurredImagesCleaner'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'cleanse'
|
||||
modal: 'image'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'image'
|
||||
outputs: 'image'
|
||||
settings:
|
||||
blurredThreshold:
|
||||
name: 梯度函数值
|
||||
description: 梯度函数值取值越小,图片模糊度越高。
|
||||
type: slider
|
||||
defaultVal: 1000
|
||||
min: 1
|
||||
max: 10000
|
||||
step: 1
|
||||
50
runtime/ops/filter/img_blurred_images_cleaner/process.py
Normal file
@@ -0,0 +1,50 @@
|
||||
# -- encoding: utf-8 --
|
||||
|
||||
"""
|
||||
Description:
|
||||
Create: 2025/01/17
|
||||
"""
|
||||
import time
|
||||
|
||||
from typing import Dict, Any
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
|
||||
from datamate.common.utils import bytes_transform
|
||||
from datamate.core.base_op import Filter
|
||||
|
||||
|
||||
class ImgBlurredImagesCleaner(Filter):
|
||||
"""过滤模糊度低于阈值的图片插件"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(ImgBlurredImagesCleaner, self).__init__(*args, **kwargs)
|
||||
# 设置模糊度阈值
|
||||
self._blurred_threshold = kwargs.get("blurredThreshold", 1000)
|
||||
|
||||
def execute(self, sample: Dict[str, Any]):
|
||||
start = time.time()
|
||||
img_bytes = sample[self.data_key]
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
if img_bytes:
|
||||
data = bytes_transform.bytes_to_numpy(img_bytes)
|
||||
blurred_images = self._blurred_images_filter(data, file_name)
|
||||
sample[self.data_key] = bytes_transform.numpy_to_bytes(blurred_images, file_type)
|
||||
logger.info(f"fileName: {file_name}, method: ImagesBlurredCleaner costs {(time.time() - start):6f} s")
|
||||
return sample
|
||||
|
||||
def _blurred_images_filter(self, image, file_name):
|
||||
# 为方便与其他图片比较可以将图片resize到同一个大小
|
||||
img_resize = cv2.resize(image, (112, 112))
|
||||
# 将图片压缩为单通道的灰度图
|
||||
gray = cv2.cvtColor(img_resize, cv2.COLOR_BGR2GRAY)
|
||||
score = cv2.Laplacian(gray, cv2.CV_64F).var()
|
||||
if score <= self._blurred_threshold:
|
||||
logger.info(f"The image blur is {self._blurred_threshold}, "
|
||||
f"which exceeds the threshold of {score}. {file_name} is filtered out.")
|
||||
return np.array([])
|
||||
return image
|
||||
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='ImgDuplicatedImagesCleaner',
|
||||
module_path="ops.filter.img_duplicated_images_cleaner.process")
|
||||
@@ -0,0 +1,16 @@
|
||||
name: '重复图片去除'
|
||||
name_en: 'Duplicate Image Removal'
|
||||
description: '去除重复的图片。'
|
||||
description_en: 'Removes duplicate images.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'ImgDuplicatedImagesCleaner'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'cleanse'
|
||||
modal: 'image'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'image'
|
||||
outputs: 'image'
|
||||
109
runtime/ops/filter/img_duplicated_images_cleaner/process.py
Normal file
@@ -0,0 +1,109 @@
|
||||
# -- encoding: utf-8 --
|
||||
|
||||
"""
|
||||
Description:
|
||||
Based on MD5 hashes, determines whether the current image is identical to another image in the dataset; if so, the image is filtered out and the original dataset image is kept.
|
||||
The file feature (its MD5 value) is stored in the database. Historical file features are fetched by task uuid and compared one by one for deduplication.
|
||||
Create: 2025/1/7
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
|
||||
import cv2
|
||||
from Crypto.Hash import MD5
|
||||
from sqlalchemy import text
|
||||
from loguru import logger
|
||||
|
||||
from datamate.sql_manager.sql_manager import SQLManager
|
||||
from datamate.common.utils import get_now_time
|
||||
from datamate.common.utils import bytes_to_numpy, numpy_to_bytes
|
||||
from datamate.core.base_op import Filter
|
||||
|
||||
|
||||
class ImgDuplicatedImagesCleaner(Filter):
|
||||
"""去除重复图片插件
|
||||
基于MD5值计算当前图片与数据集中其它图片是否相同。相同该图片过滤,保留原数据集图片。
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
# task_uuid为标识该数据集的唯一标志
|
||||
super().__init__(*args, **kwargs)
|
||||
self.task_uuid = kwargs.get("uuid", "")
|
||||
self.img_resize = 200 # 图片压缩尺寸
|
||||
# 获取数据库sql
|
||||
self.sql_dict = self.load_sql_dict()
|
||||
# 获取数据库连接池
|
||||
self.conn = None # 数据库连接
|
||||
self.trans = None # 数据库事务
|
||||
|
||||
@staticmethod
|
||||
def load_sql_dict():
|
||||
"""获取sql语句"""
|
||||
sql_config_path = str(Path(__file__).parent / 'sql' / 'sql_config.json')
|
||||
with open(sql_config_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
|
||||
def compute_md5(self, img_bytes: bytes) -> str:
|
||||
"""将图片统一转化为png无损格式,计算每张图像的md5值"""
|
||||
if not img_bytes:
|
||||
return ""
|
||||
img = bytes_to_numpy(img_bytes)
|
||||
height, width = img.shape[:2] # 获取原图像的水平方向尺寸和垂直方向尺寸。
|
||||
res = cv2.resize(img, (int(width / height * self.img_resize), self.img_resize), interpolation=cv2.INTER_AREA)
|
||||
img_bytes = numpy_to_bytes(res, ".png")
|
||||
hash_md5 = MD5.new()
|
||||
hash_md5.update(img_bytes)
|
||||
return hash_md5.hexdigest()
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""重复图片去重算子执行入口"""
|
||||
start = time.time()
|
||||
file_name = sample[self.filename_key]
|
||||
self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
|
||||
img_data = self._duplicate_images_filter(file_name, sample[self.data_key])
|
||||
sample[self.data_key] = img_data
|
||||
logger.info(
|
||||
f"fileName: {file_name}, method: DuplicateImagesCleaner costs {(time.time() - start):6f} s")
|
||||
return sample
|
||||
|
||||
def execute_sql(self, md5: str, file_name: str,
|
||||
img_bytes: bytes) -> bytes:
|
||||
"""从数据库中获取文件特征、比较MD5,插入新的文件特征"""
|
||||
timestamp = get_now_time('Asia/Shanghai', '%Y-%m-%d %H:%M:%S', file_name,
|
||||
"DuplicateImagesCleaner")
|
||||
query_sql = str(self.sql_dict.get("query_sql"))
|
||||
insert_sql = str(self.sql_dict.get("insert_sql"))
|
||||
create_tables_sql = str(self.sql_dict.get("create_tables_sql"))
|
||||
query_sql_params = {"task_uuid": self.task_uuid, "file_feature": md5}
|
||||
insert_sql_params = {"task_uuid": self.task_uuid, "file_feature": md5, "file_name": file_name.encode("utf-8"),
|
||||
"timestamp": timestamp}
|
||||
|
||||
db_manager = SQLManager()
|
||||
try:
|
||||
self.conn = db_manager.create_connect()
|
||||
except Exception as e:
|
||||
logger.error(f"fileName: {file_name}, database connection failed: {str(e)}")
|
||||
raise RuntimeError(82000, str(e)) from None
|
||||
|
||||
with self.conn as connection:
|
||||
connection.execute(text(create_tables_sql))
|
||||
# 判断是否有重复文件
|
||||
result = connection.execute(text(query_sql), query_sql_params).fetchall()
|
||||
# 查询记录为空,无重复图片, 插入新文件特征
|
||||
if not result:
|
||||
connection.execute(text(insert_sql), insert_sql_params)
|
||||
return img_bytes
|
||||
logger.info(f"taskId: {self.task_uuid} fileName: {file_name}, method: Duplicate ImagesCleaner. "
|
||||
f"The image is duplicated and filtered ")
|
||||
return b""
|
||||
|
||||
def _duplicate_images_filter(self, file_name: str, img_bytes: bytes) -> bytes:
|
||||
"""重复图片去重算子执行逻辑"""
|
||||
# 如果文件为空,则无需去重,返回原图
|
||||
if not img_bytes:
|
||||
return img_bytes
|
||||
md5 = self.compute_md5(img_bytes)
|
||||
return self.execute_sql(md5, file_name, img_bytes)
|
||||
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"query_sql": "SELECT * FROM operator_duplicate_img_features WHERE task_uuid = :task_uuid AND file_feature = :file_feature",
|
||||
"insert_sql": "INSERT INTO operator_duplicate_img_features (task_uuid, file_feature, file_name, timestamp) VALUES (:task_uuid, :file_feature, :file_name, :timestamp)",
|
||||
"create_tables_sql": "CREATE TABLE IF NOT EXISTS operator_duplicate_img_features (id INT AUTO_INCREMENT PRIMARY KEY,task_uuid VARCHAR(255),file_feature TEXT,file_name TEXT,timestamp DATETIME);"
|
||||
}
|
||||
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='ImgSimilarImagesCleaner',
|
||||
module_path="ops.filter.img_similar_images_cleaner.process")
|
||||
25
runtime/ops/filter/img_similar_images_cleaner/metadata.yml
Normal file
@@ -0,0 +1,25 @@
|
||||
name: '相似图片去除'
|
||||
name_en: 'Similar Image Removal'
|
||||
description: '去除相似的图片。'
|
||||
description_en: 'Removes similar images.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'ImgSimilarImagesCleaner'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'cleanse'
|
||||
modal: 'image'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'image'
|
||||
outputs: 'image'
|
||||
settings:
|
||||
similarThreshold:
|
||||
name: 相似度
|
||||
description: 相似度取值越大,图片相似度越高。
|
||||
type: slider
|
||||
defaultVal: 0.8
|
||||
min: 0
|
||||
max: 1
|
||||
step: 0.01
|
||||
238
runtime/ops/filter/img_similar_images_cleaner/process.py
Normal file
@@ -0,0 +1,238 @@
|
||||
# -- encoding: utf-8 --
|
||||
|
||||
"""
|
||||
Description:
|
||||
1.本算子结合感知哈希算法和ORB两个算法判断图片的相似性
|
||||
2.感知哈希算法则是从图像的整体结构和特征维度来计算图片的相似度。
|
||||
3.ORB算法可以用来对图像中的关键点快速创建特征向量,这些特征向量可以用来识别图像中的对象。通过比较两张图片的特征向量计算相似度。
|
||||
4.感知哈希算法和ORB算法计算相似度高于0.75,则选择二者较大值;若低于0.75,则选择二者最小值作为相似度
|
||||
5.将文件特征数据存到数据库。根据任务uuid获取历史文件特征,遍历特征并进行去重比较
|
||||
Create: 2025/1/7
|
||||
"""
|
||||
import json
|
||||
import time
|
||||
import zlib
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from sqlalchemy import text
|
||||
from loguru import logger
|
||||
|
||||
from datamate.sql_manager.sql_manager import SQLManager
|
||||
from datamate.common.utils import get_now_time
|
||||
from datamate.common.utils import bytes_to_numpy
|
||||
from datamate.core.base_op import Filter
|
||||
|
||||
MAX_RETRIES = 5
|
||||
BASE_DELAY = 1
|
||||
MAX_DELAY = 30 # 最大延时设置为30秒
|
||||
JITTER_FACTOR = 0.25 # 抖动因子为等待时间的25%
|
||||
MAX_FEATURES_NUM = 200
|
||||
|
||||
|
||||
def get_orb_des(image: np.ndarray) -> np.ndarray:
|
||||
"""检测图像中的特征点kp和计算这些特征点的描述符矩阵des_matrix"""
|
||||
if not image.size:
|
||||
return np.array([])
|
||||
orb = cv2.ORB_create() # 初始化ORB检测器
|
||||
orb.setMaxFeatures(MAX_FEATURES_NUM) # 设置最大特征点数量为200
|
||||
kp, des_matrix = orb.detectAndCompute(image, None)
|
||||
if des_matrix is None:
|
||||
# 若没有提取出图像特征,描述符矩阵置为空
|
||||
des_matrix = np.array([])
|
||||
return des_matrix
|
||||
|
||||
|
||||
class ImgSimilarImagesCleaner(Filter):
|
||||
"""去除相似图片的插件"""
|
||||
|
||||
DEFAULT_SIMILAR_THRESHOLD = 0.8 # 默认相似度阈值
|
||||
DEFAULT_TASK_UUID = "uuid" # 默认任务UUID
|
||||
DEFAULT_ORB_RATIO = 0.8 # 默认特征点距离比率
|
||||
DEFAULT_MIX_SIMILARITY = 0.75 # 默认相似度算法阈值
|
||||
DEFAULT_IMG_RESIZE = 200 # 默认图片压缩尺寸
|
||||
DEFAULT_PAGE_SIZE = 500 # 默认每页数据量
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.similar_threshold = kwargs.get("similarThreshold", self.DEFAULT_SIMILAR_THRESHOLD) # 默认相似度阈值为0.8
|
||||
# task_uuid为标识该数据集的唯一标志
|
||||
self.task_uuid = kwargs.get("uuid", self.DEFAULT_TASK_UUID)
|
||||
self.orb_ratio = self.DEFAULT_ORB_RATIO # 特征点距离的比率,该数值为经验值
|
||||
self.mix_similarity = self.DEFAULT_MIX_SIMILARITY # 选择相似度算法的阈值,该数值为经验值
|
||||
self.img_resize = self.DEFAULT_IMG_RESIZE # 图片压缩尺寸
|
||||
self.conn = None # 数据库连接
|
||||
self.trans = None # 数据库事务
|
||||
self.page_size = self.DEFAULT_PAGE_SIZE # 每页数据量
|
||||
# 获取数据库sql
|
||||
self.sql_dict = self.load_sql_dict()
|
||||
|
||||
@staticmethod
|
||||
def load_sql_dict():
|
||||
"""获取sql语句"""
|
||||
sql_config_path = str(Path(__file__).parent / 'sql' / 'sql_config.json')
|
||||
with open(sql_config_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
|
||||
@staticmethod
|
||||
def get_p_hash(image: np.ndarray) -> str:
|
||||
"""计算pHash值"""
|
||||
hashed_value = ""
|
||||
if not image.size:
|
||||
return hashed_value
|
||||
gray_image = cv2.cvtColor(cv2.resize(image, (8, 8), interpolation=cv2.INTER_AREA), cv2.COLOR_BGR2GRAY)
|
||||
dct_image = cv2.dct(np.float32(gray_image))
|
||||
hashed_value = ''.join(['1' if x >= 0 else '0' for x in dct_image[:8, :8].flatten()])
|
||||
return hashed_value
|
||||
|
||||
@staticmethod
|
||||
def get_phash_similarity(hash_comparison: str, hash_compared: str) -> float:
|
||||
"""通过计算汉明距离,获取图片相似度"""
|
||||
# 若哈希值为空,则相似度为0
|
||||
if not hash_comparison or not hash_compared:
|
||||
return 0.0
|
||||
# 计算汉明距离
|
||||
distance = sum(
|
||||
bit_comparison != bit_compared for bit_comparison, bit_compared in zip(hash_comparison, hash_compared))
|
||||
similarity = 1 - distance / len(hash_comparison)
|
||||
return similarity
|
||||
|
||||
def filter_similar_images(self, img: np.ndarray, file_name: str) -> np.ndarray:
|
||||
"""判断数据集中是否存在相似图片"""
|
||||
# 如果文件为空,则无需去重,返回原图
|
||||
if not img.size:
|
||||
return img
|
||||
p_hash = self.get_p_hash(img)
|
||||
height, width = img.shape[:2] # 获取原图像的水平方向尺寸和垂直方向尺寸。
|
||||
img_resize = cv2.resize(img, (int(width / height * self.img_resize), self.img_resize),
|
||||
interpolation=cv2.INTER_AREA)
|
||||
des_matrix = get_orb_des(img_resize)
|
||||
return self.execute_sql(p_hash, des_matrix, file_name, img)
|
||||
|
||||
def get_orb_similarity(self, des_matrix: np.ndarray, des_matrix_history: np.ndarray, file_name: str,
|
||||
file_name_history: str) -> float:
|
||||
"""获取图片orb相似度"""
|
||||
# 若描述符矩阵为空,则相似度为0
|
||||
if not des_matrix.size or not des_matrix_history.size:
|
||||
return 0.0
|
||||
# 根据矩阵对角线上元素和的大小,选择描述符矩阵作为训练或查询矩阵
|
||||
train_matrix, query_matrix = des_matrix, des_matrix_history
|
||||
if train_matrix.shape[0] > des_matrix_history.shape[0]:
|
||||
train_matrix, query_matrix = des_matrix_history, des_matrix
|
||||
elif des_matrix.shape[0] == des_matrix_history.shape[0]:
|
||||
if np.trace(des_matrix) > np.trace(des_matrix_history):
|
||||
train_matrix, query_matrix = des_matrix_history, des_matrix
|
||||
|
||||
try:
|
||||
# knn筛选结果
|
||||
matches = (cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False).
|
||||
knnMatch(query_matrix, trainDescriptors=train_matrix, k=2))
|
||||
if not matches:
|
||||
return 0.0
|
||||
# 遍历每一对特征点,筛选距离更近的特征点
|
||||
count = 0
|
||||
for (m, n) in matches:
|
||||
if m.distance < self.orb_ratio * n.distance:
|
||||
count += 1
|
||||
orb_similarity = count / len(matches)
|
||||
return orb_similarity
|
||||
except Exception as e:
|
||||
logger.exception(f"taskId: {self.task_uuid}, failed to compare the similarity between "
|
||||
f"{file_name} and {file_name_history}: {e}")
|
||||
return 0.0
|
||||
|
||||
def execute_sql(self, p_hash: str, des_matrix: np.ndarray, file_name: str,
|
||||
img: np.ndarray) -> np.ndarray:
|
||||
des_matrix_binary = zlib.compress(des_matrix.tobytes()) # 使用 zlib 进行压缩数组
|
||||
timestamp = get_now_time('Asia/Shanghai', '%Y-%m-%d %H:%M:%S', file_name,
|
||||
"ImgSimilarCleaner")
|
||||
query_task_uuid_sql = str(self.sql_dict.get("query_task_uuid_sql"))
|
||||
insert_sql = str(self.sql_dict.get("insert_sql"))
|
||||
create_tables_sql = str(self.sql_dict.get("create_tables_sql"))
|
||||
|
||||
db_manager = SQLManager()
|
||||
try:
|
||||
self.conn = db_manager.create_connect()
|
||||
except Exception as e:
|
||||
logger.error(f"fileName: {file_name}, database connection failed: {str(e)}")
|
||||
raise RuntimeError(82000, str(e)) from None
|
||||
|
||||
with self.conn as connection:
|
||||
"""从数据库中获取文件特征、比较相似度,插入新的文件特征"""
|
||||
connection.execute(text(create_tables_sql))
|
||||
result = connection.execute(text(query_task_uuid_sql), {"task_uuid": self.task_uuid}).fetchall()
|
||||
total_count = len(result)
|
||||
if self.has_similar_images(connection, des_matrix, file_name, p_hash, total_count):
|
||||
return np.array([])
|
||||
|
||||
insert_data = {
|
||||
"task_uuid": self.task_uuid,
|
||||
"p_hash": p_hash,
|
||||
"des_matrix": des_matrix_binary,
|
||||
"matrix_shape": str(des_matrix.shape),
|
||||
"file_name": file_name.encode("utf-8").hex(),
|
||||
"timestamp": timestamp
|
||||
}
|
||||
connection.execute(text(insert_sql), insert_data)
|
||||
return img
|
||||
|
||||
def has_similar_images(self, connection, des_matrix, file_name, p_hash, total_count):
|
||||
for i in range(0, total_count, self.page_size):
|
||||
query_sql = self.sql_dict.get("query_sql")
|
||||
rows = connection.execute(text(query_sql), {"task_uuid": self.task_uuid, "ge": self.page_size, "le": i}).fetchall()
|
||||
# 对应任务uuid,最后一页没有数据,跳出循环
|
||||
if not rows:
|
||||
break # 对两张图片进行相似度比较
|
||||
if self.determine_similar_images(rows, p_hash, des_matrix, file_name):
|
||||
return True
|
||||
return False
|
||||
|
||||
def determine_similar_images(self, file_features: List, p_hash: str, des_matrix: np.ndarray,
|
||||
file_name: str) -> bool:
|
||||
"""根据文件特征,判断两张图片相似度是否超过指定阈值"""
|
||||
for signature in file_features:
|
||||
pash_feature, orb_feature, matrix_shape, file_name_history = signature[2], signature[3], signature[4], \
|
||||
signature[5]
|
||||
if not pash_feature:
|
||||
# 若图片为空,p_hash、des_matrix为空,跳过比对
|
||||
continue
|
||||
# 解压缩数据
|
||||
decompressed_data = zlib.decompress(orb_feature)
|
||||
# 将字节流转换回矩阵
|
||||
des_matrix_history = np.frombuffer(decompressed_data, dtype=np.uint8).reshape(eval(matrix_shape))
|
||||
# 移除转义字符 '\' 并将十六进制字符串转换为字节序列
|
||||
bytes_data = bytes.fromhex(file_name_history)
|
||||
# 解码字节序列为 UTF-8 编码的字符串
|
||||
file_name_decoded = bytes_data.decode('utf-8')
|
||||
|
||||
phash_similarity = self.get_phash_similarity(p_hash, pash_feature)
|
||||
orb_similarity = self.get_orb_similarity(des_matrix, des_matrix_history, file_name, file_name_decoded)
|
||||
max_similarity = max(phash_similarity, orb_similarity)
|
||||
min_similarity = min(phash_similarity, orb_similarity)
|
||||
if max_similarity >= self.mix_similarity:
|
||||
result = max_similarity
|
||||
else:
|
||||
result = min_similarity
|
||||
similarity = round(result, 2)
|
||||
if similarity >= self.similar_threshold:
|
||||
logger.info(
|
||||
"fileName: %s, method: ImgSimilarCleaner, dataset: %s. This picture is similar to %s, "
|
||||
"and the similarity is %.4f. The picture is filtered.", file_name, self.task_uuid,
|
||||
file_name_decoded, similarity)
|
||||
return True
|
||||
return False
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""去除相似图片算子执行入口"""
|
||||
start = time.time()
|
||||
file_name = sample[self.filename_key]
|
||||
img_bytes = sample[self.data_key]
|
||||
data = bytes_to_numpy(img_bytes) if img_bytes else np.array([])
|
||||
similar_images = self.filter_similar_images(data, file_name)
|
||||
# 若相似图片,sample[self.data_key]设为空
|
||||
if not similar_images.size:
|
||||
sample[self.data_key] = b""
|
||||
logger.info(f"fileName: {file_name}, method: ImgSimilarCleaner costs {(time.time() - start):6f} s")
|
||||
return sample
|
||||
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"query_sql": "SELECT * FROM operator_similar_img_features WHERE task_uuid = :task_uuid ORDER BY timestamp LIMIT :ge OFFSET :le",
|
||||
"insert_sql": "INSERT INTO operator_similar_img_features (task_uuid,p_hash,des_matrix,matrix_shape,file_name,timestamp) VALUES (:task_uuid,:p_hash,:des_matrix,:matrix_shape,:file_name,:timestamp)",
|
||||
"query_task_uuid_sql": "SELECT * FROM operator_similar_img_features WHERE task_uuid = :task_uuid",
|
||||
"create_tables_sql": "CREATE TABLE IF NOT EXISTS operator_similar_img_features (id INT AUTO_INCREMENT PRIMARY KEY,task_uuid VARCHAR(255),p_hash TEXT,des_matrix BLOB,matrix_shape TEXT,file_name TEXT,timestamp DATETIME);"
|
||||
}
|
||||
6
runtime/ops/filter/remove_duplicate_file/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='DuplicateFilesFilter',
|
||||
module_path="ops.filter.remove_duplicate_file.process")
|
||||
25
runtime/ops/filter/remove_duplicate_file/metadata.yml
Normal file
@@ -0,0 +1,25 @@
|
||||
name: '相似文档去除'
|
||||
name_en: 'Similar Document Removal'
|
||||
description: '相似文档去除。'
|
||||
description_en: 'Removes similar documents.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'DuplicateFilesFilter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'cleanse'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: '这篇文档跟数据集中的另一篇文档内容几乎一样,执行该算子后,这篇文档会被去除。'
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
settings:
|
||||
fileDuplicateThreshold:
|
||||
name: 文档相似度
|
||||
description: 基于MinHash算法和Jaccard相似度,计算当前文档与数据集中其它文档相似性,超过设定值,该文档被去除。
|
||||
type: slider
|
||||
defaultVal: 0.5
|
||||
min: 0
|
||||
max: 1
|
||||
step: 0.1
|
||||
158
runtime/ops/filter/remove_duplicate_file/process.py
Normal file
@@ -0,0 +1,158 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: 文档局部内容去重
|
||||
Create: 2025/01/07
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
|
||||
import numpy as np
|
||||
from datasketch import MinHash
|
||||
from sqlalchemy import text
|
||||
from loguru import logger
|
||||
|
||||
from datamate.sql_manager.sql_manager import SQLManager
|
||||
from datamate.common.utils import get_now_time
|
||||
from datamate.core.base_op import Filter
|
||||
|
||||
|
||||
class DuplicateFilesFilter(Filter):
|
||||
"""相似文档去除插件
|
||||
|
||||
基于MinHash计算当前文档与数据集中其它文档相似性,相似性高于设定阈值则返回空。
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
# 标点符号
|
||||
super().__init__(*args, **kwargs)
|
||||
self.punctuation_pattern = "。.??!!,,;;::()()【】{}[]“”\"\"‘’''/\n"
|
||||
# 默认相似度阈值为0.5
|
||||
self.duplicate_th = kwargs.get("fileDuplicateThreshold", 0.5)
|
||||
# task_uuid为标识该数据集的唯一标志
|
||||
self.task_uuid = kwargs.get("uuid", "")
|
||||
# 数据库连接
|
||||
self.conn = None
|
||||
# 数据库事务
|
||||
self.trans = None
|
||||
# 每页数据量
|
||||
self.page_size = 500
|
||||
# 获取数据库sql
|
||||
self.sql_dict = self.load_sql_dict()
|
||||
|
||||
@staticmethod
|
||||
def load_sql_dict():
|
||||
"""获取sql语句"""
|
||||
sql_config_path = str(Path(__file__).parent / 'sql' / 'sql_config.json')
|
||||
with open(sql_config_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
|
||||
def get_minhash(self, input_text: str) -> MinHash:
|
||||
"""获取输入文档的minhash
|
||||
|
||||
Args:
|
||||
input_text: 输入文档内容
|
||||
|
||||
Returns:
|
||||
text_minhash: 输入文档对应的minhash值
|
||||
"""
|
||||
text_minhash = MinHash()
|
||||
for word in re.split(f"[{re.escape(self.punctuation_pattern)}]", input_text.strip()):
|
||||
text_minhash.update(word.strip().encode('utf8'))
|
||||
return text_minhash
|
||||
|
||||
def deduplicate_files(self, sample: Dict[str, Any], file_name: str) -> str:
|
||||
"""去除相似文件
|
||||
|
||||
Args:
|
||||
content: 待处理的Content对象
|
||||
file_name: 文件名称
|
||||
|
||||
Returns:
|
||||
input_text: 去重后的文件内容,大于相似度值返回空,否则返回原始文本内容。
|
||||
"""
|
||||
input_text = sample[self.text_key]
|
||||
if not input_text:
|
||||
return input_text
|
||||
text_minhash = self.get_minhash(input_text)
|
||||
return self.execute_sql(text_minhash, file_name, input_text)
|
||||
|
||||
def execute_sql(self, text_minhash: MinHash, file_name: str,
|
||||
input_text: str) -> str:
|
||||
"""从数据库中获取文件特征、比较相似度,插入新的文件特征"""
|
||||
timestamp = get_now_time('Asia/Shanghai', '%Y-%m-%d %H:%M:%S', file_name,
|
||||
"DuplicateFilesFilter")
|
||||
minhash_values = text_minhash.hashvalues
|
||||
# 将 NumPy 数组转换为字符串
|
||||
minhash_values_string = np.array2string(minhash_values)
|
||||
query_task_uuid_sql = self.sql_dict.get("query_task_uuid_sql")
|
||||
insert_sql = self.sql_dict.get("insert_sql")
|
||||
create_tables_sql = self.sql_dict.get("create_tables_sql")
|
||||
db_manager = SQLManager()
|
||||
try:
|
||||
self.conn = db_manager.create_connect()
|
||||
except Exception as e:
|
||||
logger.error(f"fileName: {file_name}, database connection failed: {str(e)}")
|
||||
raise RuntimeError(82000, str(e)) from None
|
||||
with self.conn as connection:
|
||||
connection.execute(text(create_tables_sql))
|
||||
result = connection.execute(text(query_task_uuid_sql), {"task_uuid": self.task_uuid}).fetchall()
|
||||
total_count = len(result)
|
||||
if self.has_similar_text(connection, file_name, text_minhash, total_count):
|
||||
return ""
|
||||
insert_data = {
|
||||
"task_uuid": self.task_uuid,
|
||||
"file_feature": minhash_values_string,
|
||||
"file_name": file_name.encode("utf-8").hex(),
|
||||
"timestamp": timestamp
|
||||
}
|
||||
connection.execute(text(insert_sql), insert_data)
|
||||
return input_text
|
||||
|
||||
def has_similar_text(self, connection, file_name, text_minhash, total_count) -> bool:
|
||||
query_sql = self.sql_dict.get("query_sql")
|
||||
for i in range(0, total_count, self.page_size):
|
||||
rows = connection.execute(
|
||||
text(query_sql), {"task_uuid": self.task_uuid, "ge": self.page_size, "le": i}).fetchall()
|
||||
# 对应任务uuid,最后一页没有数据,跳出循环
|
||||
if not rows:
|
||||
break
|
||||
# 对两个文本进行相似度比较
|
||||
if self.determine_similar_text(rows, text_minhash, file_name):
|
||||
return True
|
||||
return False
|
||||
|
||||
def determine_similar_text(self, file_features: List, text_minhash: MinHash, file_name: str) -> bool:
|
||||
for signature in file_features:
|
||||
# 历史文件特征和历史文件名称
|
||||
file_feature, file_name_history = signature[2], signature[3]
|
||||
if not file_feature:
|
||||
continue
|
||||
minhash_obj = MinHash(num_perm=128)
|
||||
minhash_obj.hashvalues = np.array(file_feature.strip('[]').split(), dtype=np.uint64)
|
||||
similarity = text_minhash.jaccard(minhash_obj)
|
||||
|
||||
# 移除转义字符 '\' 并将十六进制字符串转换为字节序列
|
||||
bytes_data = bytes.fromhex(file_name_history)
|
||||
# 解码字节序列为 UTF-8 编码的字符串
|
||||
file_name_decoded = bytes_data.decode('utf-8')
|
||||
|
||||
if similarity >= self.duplicate_th:
|
||||
logger.info(f"taskId: {self.task_uuid}, fileName: {file_name} is similar to {file_name_decoded}, "
|
||||
f"and the similarity is {similarity:4f}")
|
||||
return True
|
||||
return False
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
file_name = sample[self.filename_key]
|
||||
self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
|
||||
sample[self.text_key] = self.deduplicate_files(sample, file_name)
|
||||
logger.info(f"taskId: {self.task_uuid} fileName: {file_name}, "
|
||||
f"method: DuplicateFilesFilter costs {(time.time() - start):6f} s")
|
||||
return sample
|
||||
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"query_sql": "SELECT * FROM operators_similar_text_features WHERE task_uuid = :task_uuid ORDER BY timestamp LIMIT :ge OFFSET :le",
|
||||
"create_tables_sql": "CREATE TABLE IF NOT EXISTS operators_similar_text_features (id INT AUTO_INCREMENT PRIMARY KEY, task_uuid VARCHAR(255),file_feature TEXT,file_name TEXT,timestamp DATETIME);",
|
||||
"insert_sql": "INSERT INTO operators_similar_text_features (task_uuid, file_feature, file_name, timestamp) VALUES (:task_uuid, :file_feature, :file_name, :timestamp)",
|
||||
"query_task_uuid_sql": "SELECT * FROM operators_similar_text_features WHERE task_uuid = :task_uuid"
|
||||
}
|
||||
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='FileWithManySensitiveWordsFilter',
|
||||
module_path="ops.filter.remove_file_with_many_sensitive_words.process")
|
||||
@@ -0,0 +1,25 @@
|
||||
name: '文档敏感词率检查'
|
||||
name_en: 'Sensitive Word Rate Check'
|
||||
description: '去除敏感词过多的文档。'
|
||||
description_en: 'Filters out files that contain excessive sensitive phrases.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'FileWithManySensitiveWordsFilter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'cleanse'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: '出售硝酸甘油出售硝酸甘油出售硝酸甘油出售硝酸甘油'
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
settings:
|
||||
sensitiveWordsRate:
|
||||
name: 文档敏感词率
|
||||
description: 敏感词的字数/文档总字数 > 设定值,该文档被去除。
|
||||
type: slider
|
||||
defaultVal: 0.01
|
||||
min: 0
|
||||
max: 1
|
||||
step: 0.01
|
||||
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: 过滤语言概率太低的文档(支持自定义阈值)
|
||||
Create: 2023/12/7 15:43
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from datamate.core.base_op import Filter
|
||||
from datamate.common.utils.aho_corasick import build_trie, add_fail_pointer
|
||||
|
||||
sys.setrecursionlimit(5000)
|
||||
|
||||
|
||||
class AhoCorasic:
|
||||
"""AC自动机算法进行目标字符串搜索"""
|
||||
|
||||
def __init__(self, words):
|
||||
self._root = add_fail_pointer(build_trie(words))
|
||||
|
||||
def search_and_count(self, text: str, special_symbols: set):
|
||||
"""
|
||||
匹配敏感词,统计敏感词字数。
|
||||
|
||||
Args:
|
||||
text: 文本
|
||||
special_symbols: 特殊字符(需跳过)
|
||||
Returns:
|
||||
统计敏感词字数
|
||||
"""
|
||||
target_count = 0
|
||||
node = self._root
|
||||
|
||||
valid_len = 0 # 当前遍历的有效长度
|
||||
for _, s in enumerate(text):
|
||||
if s in special_symbols: # 跳过特殊字符
|
||||
continue
|
||||
|
||||
matched = True
|
||||
while s not in node.child: # 当node.child没有字符s
|
||||
if node == self._root: # 当node为root(无node.fail),有效长度归0且跳出
|
||||
valid_len = 0
|
||||
matched = False
|
||||
break
|
||||
elif node.fail == self._root: # node.fail为root场景,有效长度归0,但可继续
|
||||
valid_len = 0
|
||||
node = node.fail # 移动到失败指针节点
|
||||
if not matched:
|
||||
continue
|
||||
|
||||
node = node.child.get(s)
|
||||
valid_len += 1
|
||||
if node.word: # node是单词尾字母
|
||||
target_count += valid_len
|
||||
valid_len = 0
|
||||
return target_count
|
||||
|
||||
|
||||
class FileWithManySensitiveWordsFilter(Filter):
|
||||
"""外部输入的暴力、色情文本过滤插件"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(FileWithManySensitiveWordsFilter, self).__init__(*args, **kwargs)
|
||||
root_path = Path(__file__).parent / 'resources'
|
||||
violent_file_path = str(root_path / 'violent.txt')
|
||||
sexual_file_path = str(root_path / 'sexual.txt')
|
||||
political_file_path = str(root_path / 'political.txt')
|
||||
special_symbols_path = str(root_path / 'special_symbols.txt')
|
||||
self._file_sensitive_words_rate = kwargs.get("sensitiveWordsRate", 0.01) # 参数默认值为0.01
|
||||
self.violent_words = self.load_words_list(violent_file_path)
|
||||
self.sexual_words = self.load_words_list(sexual_file_path)
|
||||
self.political_words = self.load_words_list(political_file_path)
|
||||
self.special_symbols = self.load_words_list(special_symbols_path)
|
||||
self.symbols = self.special_symbols | {"\n", "\t", "\r"} # 符号,不纳入文本字数统计
|
||||
self.words = self.violent_words | self.sexual_words | self.political_words
|
||||
self.ac_automaton = AhoCorasic(self.words)
|
||||
|
||||
@staticmethod
|
||||
def load_words_list(path):
|
||||
"""词表加载"""
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
words = set(f.read().splitlines())
|
||||
return words
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
sample[self.text_key] = self._file_with_many_sensitive_words_filter(sample[self.text_key],
|
||||
sample[self.filename_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
f"method: FileWithManySensitiveWordsFilter costs {(time.time() - start):6f} s")
|
||||
return sample
|
||||
|
||||
def _file_with_many_sensitive_words_filter(self, input_data: str, file_name):
|
||||
"""过滤敏感词过多的文档"""
|
||||
total_count = 0
|
||||
for s in input_data:
|
||||
if s not in self.symbols:
|
||||
total_count += 1
|
||||
if total_count == 0:
|
||||
return input_data
|
||||
|
||||
# 敏感词率 = 敏感词字数 / 总字数,符号不纳入统计
|
||||
sensitive_rate = self.ac_automaton.search_and_count(input_data, self.special_symbols) / total_count
|
||||
if sensitive_rate >= self._file_sensitive_words_rate:
|
||||
logger.info(f"This document contains too many sensitive words. "
|
||||
f"The proportion of sensitive words is {sensitive_rate}. "
|
||||
f"Threshold is {self._file_sensitive_words_rate}. The document {file_name} is filtered.")
|
||||
return ""
|
||||
else:
|
||||
return input_data
|
||||
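The filter above counts matched sensitive characters with the AC automaton and drops the document once that count divided by the non-symbol character count reaches `sensitiveWordsRate` (default 0.01). A worked toy example, assuming the four occurrences of "硝酸甘油" are the only matches:

```python
# Worked example with toy numbers; the exact match count depends on the word lists.
total_chars = 24                      # "出售硝酸甘油" repeated four times, no symbols
matched_chars = 16                    # assumption: 4 matches of the 4-character phrase "硝酸甘油"
rate = matched_chars / total_chars    # ≈ 0.667
print(rate >= 0.01)                   # True -> the document text is replaced with ""
```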
@@ -0,0 +1,321 @@
|
||||
习近平
|
||||
平近习
|
||||
xjp
|
||||
习太子
|
||||
习明泽
|
||||
老习
|
||||
温家宝
|
||||
温加宝
|
||||
温x
|
||||
温jia宝
|
||||
温宝宝
|
||||
温加饱
|
||||
温加保
|
||||
张培莉
|
||||
温云松
|
||||
温如春
|
||||
温jb
|
||||
胡温
|
||||
胡x
|
||||
胡jt
|
||||
胡boss
|
||||
胡总
|
||||
胡王八
|
||||
hujintao
|
||||
胡jintao
|
||||
胡j涛
|
||||
胡惊涛
|
||||
胡景涛
|
||||
胡紧掏
|
||||
湖紧掏
|
||||
胡紧套
|
||||
锦涛
|
||||
hjt
|
||||
胡派
|
||||
胡主席
|
||||
刘永清
|
||||
胡海峰
|
||||
胡海清
|
||||
江泽民
|
||||
民泽江
|
||||
江胡
|
||||
江主席
|
||||
江书记
|
||||
江浙闽
|
||||
江沢民
|
||||
江浙民
|
||||
茳泽民
|
||||
zemin
|
||||
ze民
|
||||
老江
|
||||
老j
|
||||
江core
|
||||
江x
|
||||
江派
|
||||
江zm
|
||||
jzm
|
||||
江戏子
|
||||
江蛤蟆
|
||||
江某某
|
||||
江贼
|
||||
江猪
|
||||
江氏集团
|
||||
江绵恒
|
||||
江绵康
|
||||
王冶坪
|
||||
江泽慧
|
||||
邓小平
|
||||
平小邓
|
||||
xiao平
|
||||
邓xp
|
||||
邓晓平
|
||||
邓朴方
|
||||
邓榕
|
||||
邓质方
|
||||
毛泽东
|
||||
猫泽东
|
||||
猫则东
|
||||
猫贼洞
|
||||
毛zd
|
||||
毛zx
|
||||
z东
|
||||
ze东
|
||||
泽d
|
||||
zedong
|
||||
毛太祖
|
||||
毛相
|
||||
主席画像
|
||||
改革历程
|
||||
朱镕基
|
||||
朱容基
|
||||
朱镕鸡
|
||||
朱容鸡
|
||||
朱云来
|
||||
李鹏
|
||||
李peng
|
||||
里鹏
|
||||
李月月鸟
|
||||
李小鹏
|
||||
李小琳
|
||||
华主席
|
||||
华国
|
||||
国锋
|
||||
国峰
|
||||
锋同志
|
||||
白春礼
|
||||
薄熙来
|
||||
薄一波
|
||||
蔡赴朝
|
||||
蔡武
|
||||
曹刚川
|
||||
常万全
|
||||
陈炳德
|
||||
陈德铭
|
||||
陈建国
|
||||
陈良宇
|
||||
陈绍基
|
||||
陈同海
|
||||
陈至立
|
||||
戴秉国
|
||||
丁一平
|
||||
董建华
|
||||
杜德印
|
||||
杜世成
|
||||
傅锐
|
||||
郭伯雄
|
||||
郭金龙
|
||||
贺国强
|
||||
胡春华
|
||||
耀邦
|
||||
华建敏
|
||||
黄华华
|
||||
黄丽满
|
||||
黄兴国
|
||||
回良玉
|
||||
贾庆林
|
||||
贾廷安
|
||||
靖志远
|
||||
李长春
|
||||
李春城
|
||||
李建国
|
||||
李克强
|
||||
李岚清
|
||||
李沛瑶
|
||||
李荣融
|
||||
李瑞环
|
||||
李铁映
|
||||
李先念
|
||||
李学举
|
||||
李源潮
|
||||
栗智
|
||||
梁光烈
|
||||
廖锡龙
|
||||
林树森
|
||||
林炎志
|
||||
林左鸣
|
||||
令计划
|
||||
柳斌杰
|
||||
刘奇葆
|
||||
刘少奇
|
||||
刘延东
|
||||
刘云山
|
||||
刘志军
|
||||
龙新民
|
||||
路甬祥
|
||||
罗箭
|
||||
吕祖善
|
||||
马飚
|
||||
马恺
|
||||
孟建柱
|
||||
欧广源
|
||||
强卫
|
||||
沈跃跃
|
||||
宋平顺
|
||||
粟戎生
|
||||
苏树林
|
||||
孙家正
|
||||
铁凝
|
||||
屠光绍
|
||||
王东明
|
||||
汪东兴
|
||||
王鸿举
|
||||
王沪宁
|
||||
王乐泉
|
||||
王洛林
|
||||
王岐山
|
||||
王胜俊
|
||||
王太华
|
||||
王学军
|
||||
王兆国
|
||||
王振华
|
||||
吴邦国
|
||||
吴定富
|
||||
吴官正
|
||||
无官正
|
||||
吴胜利
|
||||
吴仪
|
||||
奚国华
|
||||
习仲勋
|
||||
徐才厚
|
||||
许其亮
|
||||
徐绍史
|
||||
杨洁篪
|
||||
叶剑英
|
||||
由喜贵
|
||||
于幼军
|
||||
俞正声
|
||||
袁纯清
|
||||
曾培炎
|
||||
曾庆红
|
||||
曾宪梓
|
||||
曾荫权
|
||||
张德江
|
||||
张定发
|
||||
张高丽
|
||||
张立昌
|
||||
张荣坤
|
||||
张志国
|
||||
赵洪祝
|
||||
紫阳
|
||||
周生贤
|
||||
周永康
|
||||
朱海仑
|
||||
中南海
|
||||
大陆当局
|
||||
中国当局
|
||||
北京当局
|
||||
共产党
|
||||
党产共
|
||||
共贪党
|
||||
阿共
|
||||
产党共
|
||||
公产党
|
||||
工产党
|
||||
共c党
|
||||
共x党
|
||||
共铲
|
||||
供产
|
||||
共惨
|
||||
供铲党
|
||||
供铲谠
|
||||
供铲裆
|
||||
共残党
|
||||
共残主义
|
||||
共产主义的幽灵
|
||||
拱铲
|
||||
老共
|
||||
中共
|
||||
中珙
|
||||
中gong
|
||||
gc党
|
||||
贡挡
|
||||
gong党
|
||||
g产
|
||||
狗产蛋
|
||||
共残裆
|
||||
恶党
|
||||
邪党
|
||||
共产专制
|
||||
共产王朝
|
||||
裆中央
|
||||
土共
|
||||
土g
|
||||
共狗
|
||||
g匪
|
||||
共匪
|
||||
仇共
|
||||
症腐
|
||||
政腐
|
||||
政付
|
||||
正府
|
||||
政俯
|
||||
政f
|
||||
zhengfu
|
||||
政zhi
|
||||
挡中央
|
||||
档中央
|
||||
中国zf
|
||||
中央zf
|
||||
国wu院
|
||||
中华帝国
|
||||
gong和
|
||||
大陆官方
|
||||
北京政权
|
||||
江泽民
|
||||
胡锦涛
|
||||
温家宝
|
||||
习近平
|
||||
习仲勋
|
||||
贺国强
|
||||
贺子珍
|
||||
周永康
|
||||
李长春
|
||||
李德生
|
||||
王岐山
|
||||
姚依林
|
||||
回良玉
|
||||
李源潮
|
||||
李干成
|
||||
戴秉国
|
||||
黄镇
|
||||
刘延东
|
||||
刘瑞龙
|
||||
俞正声
|
||||
黄敬
|
||||
薄熙
|
||||
薄一波
|
||||
周小川
|
||||
周建南
|
||||
温云松
|
||||
徐明
|
||||
江泽慧
|
||||
江绵恒
|
||||
江绵康
|
||||
李小鹏
|
||||
李鹏
|
||||
李小琳
|
||||
朱云来
|
||||
朱容基
|
||||
法轮功
|
||||
李洪志
|
||||
新疆骚乱
|
||||
@@ -0,0 +1,288 @@
|
||||
爱液
|
||||
按摩棒
|
||||
拔出来
|
||||
爆草
|
||||
包二奶
|
||||
暴干
|
||||
暴奸
|
||||
暴乳
|
||||
爆乳
|
||||
暴淫
|
||||
被操
|
||||
被插
|
||||
被干
|
||||
逼奸
|
||||
仓井空
|
||||
插暴
|
||||
操逼
|
||||
操黑
|
||||
操烂
|
||||
肏你
|
||||
肏死
|
||||
操死
|
||||
操我
|
||||
厕奴
|
||||
插比
|
||||
插b
|
||||
插逼
|
||||
插进
|
||||
插你
|
||||
插我
|
||||
插阴
|
||||
潮吹
|
||||
潮喷
|
||||
成人电影
|
||||
成人论坛
|
||||
成人色情
|
||||
成人网站
|
||||
成人文学
|
||||
成人小说
|
||||
艳情小说
|
||||
成人游戏
|
||||
吃精
|
||||
抽插
|
||||
春药
|
||||
大波
|
||||
大力抽送
|
||||
大乳
|
||||
荡妇
|
||||
荡女
|
||||
盗撮
|
||||
发浪
|
||||
放尿
|
||||
肥逼
|
||||
粉穴
|
||||
干死你
|
||||
干穴
|
||||
肛交
|
||||
肛门
|
||||
龟头
|
||||
裹本
|
||||
国产av
|
||||
豪乳
|
||||
黑逼
|
||||
后穴
|
||||
虎骑
|
||||
换妻俱乐部
|
||||
黄片
|
||||
几吧
|
||||
鸡吧
|
||||
鸡巴
|
||||
鸡奸
|
||||
妓女
|
||||
奸情
|
||||
叫床
|
||||
脚交
|
||||
精液
|
||||
就去日
|
||||
巨屌
|
||||
菊花洞
|
||||
菊门
|
||||
巨奶
|
||||
巨乳
|
||||
菊穴
|
||||
开苞
|
||||
口爆
|
||||
口活
|
||||
口交
|
||||
口射
|
||||
口淫
|
||||
狂操
|
||||
狂插
|
||||
浪逼
|
||||
浪妇
|
||||
浪叫
|
||||
浪女
|
||||
漏乳
|
||||
露b
|
||||
乱交
|
||||
乱伦
|
||||
轮暴
|
||||
轮操
|
||||
轮奸
|
||||
裸陪
|
||||
买春
|
||||
美逼
|
||||
美少妇
|
||||
美乳
|
||||
美腿
|
||||
美穴
|
||||
美幼
|
||||
秘唇
|
||||
迷奸
|
||||
密穴
|
||||
蜜穴
|
||||
蜜液
|
||||
摸奶
|
||||
摸胸
|
||||
母奸
|
||||
奈美
|
||||
奶子
|
||||
男奴
|
||||
内射
|
||||
嫩逼
|
||||
嫩女
|
||||
嫩穴
|
||||
捏弄
|
||||
女优
|
||||
炮友
|
||||
砲友
|
||||
喷精
|
||||
屁眼
|
||||
前凸后翘
|
||||
强jian
|
||||
强暴
|
||||
强奸处女
|
||||
情趣用品
|
||||
情色
|
||||
拳交
|
||||
全裸
|
||||
群交
|
||||
人妻
|
||||
人兽
|
||||
日逼
|
||||
日烂
|
||||
肉棒
|
||||
肉逼
|
||||
肉唇
|
||||
肉洞
|
||||
肉缝
|
||||
肉棍
|
||||
肉茎
|
||||
肉具
|
||||
揉乳
|
||||
肉穴
|
||||
肉欲
|
||||
乳爆
|
||||
乳房
|
||||
乳沟
|
||||
乳交
|
||||
乳头
|
||||
骚逼
|
||||
骚比
|
||||
骚女
|
||||
骚水
|
||||
骚穴
|
||||
色逼
|
||||
色情网站
|
||||
色区
|
||||
色色
|
||||
色诱
|
||||
色欲
|
||||
色b
|
||||
射爽
|
||||
射颜
|
||||
食精
|
||||
释欲
|
||||
兽奸
|
||||
兽交
|
||||
手淫
|
||||
兽欲
|
||||
熟妇
|
||||
熟母
|
||||
熟女
|
||||
爽片
|
||||
双臀
|
||||
死逼
|
||||
丝袜
|
||||
丝诱
|
||||
松岛枫
|
||||
酥痒
|
||||
汤加丽
|
||||
套弄
|
||||
体奸
|
||||
体位
|
||||
舔脚
|
||||
舔阴
|
||||
调教
|
||||
偷欢
|
||||
推油
|
||||
脱内裤
|
||||
文做
|
||||
舞女
|
||||
吸精
|
||||
夏川纯
|
||||
相奸
|
||||
小逼
|
||||
小穴
|
||||
小xue
|
||||
性感妖娆
|
||||
性感诱惑
|
||||
性虎
|
||||
性饥渴
|
||||
性技巧
|
||||
性交
|
||||
性奴
|
||||
性虐
|
||||
性息
|
||||
性欲
|
||||
穴口
|
||||
穴图
|
||||
亚情
|
||||
颜射
|
||||
阳具
|
||||
杨思敏
|
||||
要射了
|
||||
一夜欢
|
||||
一夜情
|
||||
一ye情
|
||||
阴部
|
||||
淫虫
|
||||
阴唇
|
||||
淫荡
|
||||
阴道
|
||||
淫电影
|
||||
阴阜
|
||||
淫妇
|
||||
淫河
|
||||
阴核
|
||||
阴户
|
||||
淫贱
|
||||
淫叫
|
||||
淫教师
|
||||
阴茎
|
||||
阴精
|
||||
淫浪
|
||||
淫媚
|
||||
淫糜
|
||||
淫魔
|
||||
淫母
|
||||
淫女
|
||||
淫虐
|
||||
淫妻
|
||||
淫情
|
||||
淫色
|
||||
淫声浪语
|
||||
淫兽学园
|
||||
淫书
|
||||
淫术炼金士
|
||||
淫水
|
||||
淫娃
|
||||
淫威
|
||||
淫亵
|
||||
淫样
|
||||
淫液
|
||||
淫照
|
||||
阴b
|
||||
应召
|
||||
幼交
|
||||
欲火
|
||||
欲女
|
||||
玉乳
|
||||
玉穴
|
||||
援交
|
||||
原味内衣
|
||||
援助交际
|
||||
招鸡
|
||||
招妓
|
||||
抓胸
|
||||
自慰
|
||||
作爱
|
||||
a片
|
||||
fuck
|
||||
gay片
|
||||
g点
|
||||
h动画
|
||||
h动漫
|
||||
失身粉
|
||||
淫荡自慰器
|
||||
@@ -0,0 +1,50 @@
|
||||
!
|
||||
|
||||
.
|
||||
,
|
||||
#
|
||||
$
|
||||
%
|
||||
&
|
||||
*
|
||||
(
|
||||
)
|
||||
|
|
||||
?
|
||||
/
|
||||
@
|
||||
"
|
||||
'
|
||||
;
|
||||
[
|
||||
]
|
||||
{
|
||||
}
|
||||
+
|
||||
~
|
||||
-
|
||||
_
|
||||
=
|
||||
^
|
||||
<
|
||||
>
|
||||
!
|
||||
。
|
||||
,
|
||||
¥
|
||||
(
|
||||
)
|
||||
?
|
||||
、
|
||||
“
|
||||
‘
|
||||
;
|
||||
【
|
||||
】
|
||||
——
|
||||
…
|
||||
……
|
||||
《
|
||||
》
|
||||
:
|
||||
:
|
||||
@@ -0,0 +1,452 @@
|
||||
出售雷管
|
||||
炸药
|
||||
出售炸药
|
||||
出售炸药电话
|
||||
制作火药配方
|
||||
炸药出售
|
||||
出售雷管炸药
|
||||
火药配方
|
||||
在家里做原子弹
|
||||
硝酸甘油炸弹制作
|
||||
TNT
|
||||
炸弹的制作
|
||||
硝铵炸药配方
|
||||
硝酸甘油制作
|
||||
炸弹配方
|
||||
简易炸药
|
||||
火药制作简易炸弹
|
||||
炸弹制作
|
||||
硝酸甘油
|
||||
硝酸甘油炸弹制作方法
|
||||
手把手教你做炸弹
|
||||
恐怖分子傻瓜手册
|
||||
氢弹手工制作方法
|
||||
起爆器
|
||||
自制燃烧弹
|
||||
燃烧弹
|
||||
弹药制造方法
|
||||
液体炸药配方
|
||||
炸药配方与制作
|
||||
自制炸药配方
|
||||
tnt炸药成分
|
||||
炸药的制作
|
||||
tnt炸药配方
|
||||
各种炸药配方大全
|
||||
炸药的成分
|
||||
燃烧弹的制作方
|
||||
黑火药的配方
|
||||
黑火药的制作
|
||||
黑火药比例
|
||||
制作燃烧弹
|
||||
黑火药配方比例
|
||||
黑火药的成分
|
||||
原子弹配方
|
||||
原子弹DIY教程
|
||||
C4炸药
|
||||
雷管出售
|
||||
制作硝化甘油
|
||||
DIY核弹
|
||||
核弹制作方法
|
||||
原子弹制作手册
|
||||
简易原子弹教程
|
||||
黑火药配方
|
||||
自制炸药方法
|
||||
教你怎么用土办法做武器
|
||||
zz炸弹的制作方法
|
||||
C4炸药的成分
|
||||
炸药制造生产工艺
|
||||
原子弹制造完全手册
|
||||
汽车炸弹制作
|
||||
人体炸弹制作流程
|
||||
炸弹制作从入门
|
||||
鸡尾酒炸弹制作
|
||||
集束炸弹制作
|
||||
硝酸铵的简便制取方法
|
||||
莫洛托夫燃烧弹
|
||||
塑胶炸弹
|
||||
用火药混合塑料
|
||||
高性能爆炸物质混合
|
||||
原子弹制作方法
|
||||
手教你制作原子弹
|
||||
轻型巡航导弹的制作
|
||||
黑索金的制造过程
|
||||
Zha药制造进程
|
||||
购买自爆材料
|
||||
可塑炸弹制作方法
|
||||
C4塑胶炸药
|
||||
水浴法制TNT
|
||||
简易核弹头的制造
|
||||
高爆炸药的基本配方
|
||||
如何制硝化甘油
|
||||
燃烧剂的简易制备
|
||||
简易大威力炸弹
|
||||
炸弹制作入门
|
||||
硝酸甘油炸药制作
|
||||
基本配方是硝酸甘
|
||||
燃烧剂的简易制作
|
||||
制作性能优良的燃烧弹
|
||||
酸甘油炸药
|
||||
简易导弹制作
|
||||
核弹头的制造
|
||||
可塑炸弹制作
|
||||
炸药硝酸甘油制法
|
||||
原子弹制作方法
|
||||
火药的配方
|
||||
手教你做原子弹
|
||||
怎么用土办法做武器
|
||||
手把手教你制作原子弹
|
||||
燃烧弹的制作方法
|
||||
怎么自制燃烧弹
|
||||
TNT炸弹制作
|
||||
高爆炸药基本配方
|
||||
爆破和杀伤燃烧弹
|
||||
炸弹制作从入门到精
|
||||
写的炸弹制作教程
|
||||
原子弹制作方法
|
||||
教你制作原子弹
|
||||
下硝化甘油的制作方法
|
||||
DIY原子弹
|
||||
原子弹制作
|
||||
炸药及其制造方法
|
||||
ATOM弹制造
|
||||
三分钟教你原子弹
|
||||
火药原理
|
||||
简易炸弹
|
||||
简易燃烧弹
|
||||
土炸药成份
|
||||
你怎么用土办法做武器
|
||||
燃烧弹制作
|
||||
燃烧瓶的制作方法
|
||||
炸弹制作手册
|
||||
原子弹制作方法DIY
|
||||
汽油三分之一白糖混合
|
||||
炸药的制作配方
|
||||
燃烧弹的制作方
|
||||
炸药的配方
|
||||
如何自制燃烧弹
|
||||
标准炸弹教程
|
||||
黑火药和硝酸钾农药混合
|
||||
恐怖分子自己制造炸弹
|
||||
手柄制作炸弹引爆器
|
||||
制作炸弹首先要
|
||||
燃烧瓶制作
|
||||
出售炸药
|
||||
出售气枪
|
||||
出售炸药QQ
|
||||
出售气枪QQ
|
||||
出售气枪电话
|
||||
出售手枪QQ
|
||||
出售手枪电话
|
||||
出售猎枪
|
||||
出售猎枪QQ
|
||||
出售猎枪电话
|
||||
出售汽枪
|
||||
出售步枪
|
||||
出售狙击枪
|
||||
出售手狗
|
||||
出售气狗
|
||||
出售猎狗
|
||||
手枪出售
|
||||
64式手枪出售
|
||||
92式手枪出售
|
||||
高仿真枪
|
||||
出售54式手枪
|
||||
出售M1911
|
||||
出售各类猎狗
|
||||
出售国产军用54手枪
|
||||
出售火枪游戏
|
||||
出售进口金属枪模
|
||||
出售仿真枪
|
||||
出售麻醉枪免收任何定金
|
||||
出售手枪
|
||||
出售外国枪模
|
||||
出售袖珍手枪
|
||||
娥眉气枪出售
|
||||
防身手枪QQ
|
||||
防身武器手枪
|
||||
仿真枪QQ
|
||||
钢珠枪小口径步枪
|
||||
高压狗QQ
|
||||
高压气枪
|
||||
气枪子弹
|
||||
高压气枪电狗专卖
|
||||
高压气枪
|
||||
气枪货到付款
|
||||
高压气枪专卖店
|
||||
各类军用枪
|
||||
各类军用枪支
|
||||
各式气枪出售
|
||||
工字牌气枪出售专卖
|
||||
气枪
|
||||
工字汽狗麻醉枪
|
||||
供应军用手枪
|
||||
供应汽枪
|
||||
购买枪支QQ
|
||||
国产汽狗专卖
|
||||
气枪货
|
||||
集团出售手枪
|
||||
集团出售手枪气枪
|
||||
进口气枪
|
||||
气枪子弹
|
||||
军用54出售
|
||||
军用77出售
|
||||
军用手枪钢珠枪
|
||||
里有汽枪出售
|
||||
猎枪出售
|
||||
猎枪仿真枪QQ
|
||||
猎枪麻醉枪钢珠枪
|
||||
猎枪哪里有卖
|
||||
猎枪手枪买卖
|
||||
猎枪销售手枪销售气枪
|
||||
猎枪专卖店
|
||||
猎枪子弹底火专卖
|
||||
麻醉枪出售可验货
|
||||
麻醉枪猎枪军用
|
||||
买卖54狗
|
||||
买卖64狗
|
||||
买卖警察枪支
|
||||
买卖军狗
|
||||
买卖军用枪支
|
||||
卖64手枪
|
||||
卖92手枪
|
||||
卖钢珠枪QQ
|
||||
哪里能买到自制手枪
|
||||
哪里有钢珠枪
|
||||
哪里有连发猎枪买卖
|
||||
哪里有卖双管猎
|
||||
哪里有气枪卖
|
||||
哪里有枪出售
|
||||
哪里有真枪出售
|
||||
哪里有真枪卖买
|
||||
哪能买到真枪
|
||||
出售手枪气枪猎
|
||||
气狗
|
||||
高压气枪
|
||||
气狗出售
|
||||
进口气狗
|
||||
气枪专卖网
|
||||
气枪出售
|
||||
气枪专卖网
|
||||
猎枪
|
||||
钢珠枪
|
||||
气枪仿真枪出售
|
||||
气枪购买
|
||||
气枪组装
|
||||
气枪货到付款
|
||||
汽狗出售
|
||||
气枪价格
|
||||
仿真气枪
|
||||
气枪价格
|
||||
气枪专卖店
|
||||
气枪看货加QQ
|
||||
气枪联系QQ
|
||||
气枪猎枪麻醉枪
|
||||
气枪迷魂药
|
||||
汽狗高压汽枪
|
||||
铅弹
|
||||
上海工字气枪
|
||||
全金属仿真枪专卖
|
||||
热武器QQ
|
||||
三箭气枪出售
|
||||
枪猎
|
||||
手槍买卖自制手枪
|
||||
售枪购买手枪
|
||||
售手枪QQ
|
||||
司马仿真枪模型QQ
|
||||
淘宝气枪货到付款
|
||||
提供气狗专卖店
|
||||
提供专业及时的气狗
|
||||
秃鹰气枪货到付款
|
||||
锡峰气枪出售
|
||||
仙鹤气枪出售
|
||||
买卖77式手枪
|
||||
销售电手狗
|
||||
销售气手狗
|
||||
销售小口径步枪
|
||||
销售运动步枪
|
||||
小口径秃鹰沙漠
|
||||
有军用枪支出售
|
||||
长期出售手枪
|
||||
折叠狗QQ
|
||||
转让猎枪
|
||||
子弹货到付款
|
||||
自制手枪哪里买
|
||||
自制手枪哪里有卖
|
||||
阻击枪
|
||||
汽枪
|
||||
高压气枪
|
||||
组装手枪
|
||||
最新销售汽狗
|
||||
出售警用手铐
|
||||
仿真枪
|
||||
猎枪
|
||||
真枪
|
||||
手枪
|
||||
阻击枪
|
||||
弓弩专营
|
||||
弓弩专卖网货到付款
|
||||
弓弩专卖网
|
||||
弓弩专卖货到付款
|
||||
弓弩专卖店网
|
||||
弓弩专卖店
|
||||
弓弩专卖
|
||||
弓弩制作
|
||||
弓弩直销
|
||||
弓弩有限公司
|
||||
弓弩营销中心
|
||||
弓弩一百专卖店
|
||||
弓弩销售
|
||||
弓弩网
|
||||
弓弩图纸
|
||||
弓弩特许经销
|
||||
弓弩狩猎网
|
||||
自制手弩
|
||||
追风弓弩麻醉箭专卖
|
||||
专业弓弩网
|
||||
中国战神军用弓弩
|
||||
中国弩弓专卖
|
||||
中国弓弩专卖网
|
||||
中国弓弩直销
|
||||
中国弓弩网
|
||||
中国弓弩狩猎网
|
||||
中国弓驽网
|
||||
制作简易弓弩
|
||||
郑州弓弩专卖
|
||||
赵氏弓弩专卖网
|
||||
赵氏弓弩专卖店
|
||||
赵氏弓弩专卖
|
||||
赵氏弓弩销售
|
||||
小型弓弩专卖店
|
||||
小猎人弓弩网
|
||||
狩猎器材弓弩专卖
|
||||
狩猎器材弓弩
|
||||
狩猎弓弩专卖网
|
||||
狩猎弓弩专卖
|
||||
狩猎弓弩麻醉箭
|
||||
手枪式折叠三用弩
|
||||
三利达弓弩专卖网
|
||||
三利达弓弩直营
|
||||
三利达弓弩配件
|
||||
三步倒药箭批发
|
||||
三步倒弩箭专卖
|
||||
三步倒麻醉弩箭销售
|
||||
三步倒麻醉箭专卖
|
||||
三步倒麻醉箭
|
||||
三步倒捕狗药
|
||||
军用弓弩专卖网
|
||||
军用弓弩专卖店
|
||||
军用弓弩批发
|
||||
军用弓弩公司
|
||||
供应三利达弓弩麻醉箭
|
||||
供应三步倒麻醉箭
|
||||
供应秦氏弓弩
|
||||
供应弩用麻醉箭
|
||||
供应弩捕狗箭
|
||||
供应麻醉箭三步倒
|
||||
供应麻醉箭批发
|
||||
供应麻醉箭
|
||||
供应军用弩折叠弩
|
||||
供应军用弓弩专卖
|
||||
供应精品弓弩
|
||||
供应弓弩麻醉箭
|
||||
供应弓弩
|
||||
供应钢珠弓弩
|
||||
弓弩商城专卖
|
||||
弓弩商城
|
||||
弓弩亲兄弟货到付款
|
||||
弓弩批发
|
||||
弓弩免定金货到付款
|
||||
弓弩麻醉箭
|
||||
弓弩麻醉镖
|
||||
弓弩论坛
|
||||
钢珠弓弩专卖网
|
||||
钢珠弓弩专卖店
|
||||
打狗弓弩三步倒
|
||||
麻醉弓弩专卖店
|
||||
出售军刀
|
||||
出售军刺
|
||||
出售弹簧刀
|
||||
出售三棱刀
|
||||
出售跳刀
|
||||
军刀网
|
||||
南方军刀网
|
||||
户外军刀网
|
||||
三棱军刺专卖
|
||||
出售开山刀军刺
|
||||
西点军刀网
|
||||
军刀专卖
|
||||
戈博军刀
|
||||
阿兰德龙户外
|
||||
出售军品军刀
|
||||
勃朗宁军刀
|
||||
军刀军品网
|
||||
阿兰得龙野营刀具网
|
||||
出售军刺军刀
|
||||
警用刀具出售
|
||||
折刀专卖网
|
||||
阳江军品军刀网
|
||||
野营刀专卖
|
||||
砍刀精品折刀专卖
|
||||
匕首蝴蝶甩刀专卖
|
||||
军刀专卖军刺
|
||||
军刀专卖刀具批发
|
||||
军刀图片砍刀
|
||||
军刀网军刀专卖
|
||||
军刀价格军用刀具
|
||||
军品军刺网
|
||||
军刀军刺甩棍
|
||||
阳江刀具批发网
|
||||
北方先锋军刀
|
||||
正品军刺出售
|
||||
野营军刀出售
|
||||
开山刀砍刀出售
|
||||
仿品军刺出售
|
||||
军刀直刀专卖
|
||||
手工猎刀专卖
|
||||
自动跳刀专卖
|
||||
军刀电棍销售
|
||||
军刀甩棍销售
|
||||
美国军刀出售
|
||||
极端武力折刀
|
||||
防卫棍刀户外刀具
|
||||
阿兰德龙野营刀
|
||||
仿品军刺网
|
||||
野营砍刀户外军刀
|
||||
手工猎刀户外刀具
|
||||
中国户外刀具网
|
||||
西点军品军刀网
|
||||
野营开山刀军刺
|
||||
三利达弓弩军刀
|
||||
尼泊尔军刀出售
|
||||
防卫野营砍刀出售
|
||||
防卫著名军刀出售
|
||||
防卫棍刀出售
|
||||
防卫甩棍出售
|
||||
防卫电棍出售
|
||||
军刺野营砍刀出售
|
||||
著名精品折刀出售
|
||||
战术军刀出售
|
||||
刺刀专卖网
|
||||
户外军刀出售
|
||||
阳江刀具直销网
|
||||
冷钢刀具直销网
|
||||
防卫刀具直销网
|
||||
极端武力直销网
|
||||
刀具直销网
|
||||
军刀直销网
|
||||
直刀匕首直销网
|
||||
军刀匕首直销网
|
||||
折刀砍刀军品网
|
||||
野营刀具军品网
|
||||
阳江刀具军品网
|
||||
冷钢刀具军品网
|
||||
防卫刀具军品网
|
||||
极端武力军品网
|
||||
军用刀具军品网
|
||||
军刀直刀军品网
|
||||
折刀砍刀专卖
|
||||
野营刀具专卖
|
||||
阳江刀具专卖
|
||||
冷钢刀具专卖
|
||||
防卫刀具专卖
|
||||
出售美军现役军刀
|
||||
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='FileWithShortOrLongLengthFilter',
|
||||
module_path="ops.filter.remove_file_with_short_or_long_length.process")
|
||||
@@ -0,0 +1,34 @@
|
||||
name: '文档字数检查'
|
||||
name_en: 'Word Count Check'
|
||||
description: '字数不在指定范围会被过滤掉。'
|
||||
description_en: 'Filters out documents whose word count is not in the specified range.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'FileWithShortOrLongLengthFilter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'cleanse'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: '过短文本'
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
settings:
|
||||
fileLength:
|
||||
name: 文档字数
|
||||
description: '过滤字数不在指定范围内的文档,如[10,10000000]。若输入为空,则不对字数上/下限做限制。'
|
||||
type: range
|
||||
properties:
|
||||
- name: fileMinimumLength
|
||||
type: inputNumber
|
||||
defaultVal: 10
|
||||
min: 0
|
||||
max: 10000000000000000
|
||||
step: 1
|
||||
- name: fileMaximumLength
|
||||
type: inputNumber
|
||||
defaultVal: 10000000
|
||||
min: 0
|
||||
max: 10000000000000000
|
||||
step: 1
|
||||
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: 文档字数不在指定范围会被过滤掉(支持自定义阈值)
|
||||
Create: 2025/01/16
|
||||
"""
|
||||
|
||||
import re
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from datamate.core.base_op import Filter
|
||||
|
||||
|
||||
class FileWithShortOrLongLengthFilter(Filter):
|
||||
"""检查文档字数目,词数目不在指定范围会被过滤掉(支持自定义阈值)"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
file_length_list = kwargs.get("fileLength", [10, 10000000]) # [下限,上限],默认字数下限为10, 默认字数上限为10000000
|
||||
if len(file_length_list) != 2: # 要求传入字数目上限和字数目下限
|
||||
logger.error(f"method: FileWithShortOrLongLengthFilter expected 2 arguments, got {len(file_length_list)}")
|
||||
raise RuntimeError(82001, "method: FileWithShortOrLongLengthFilter expected 2 arguments") from None
|
||||
# 用户不输入下限参数时前端传入'',则不对字数目下限控制
|
||||
self._file_minimum_length = 0 if not file_length_list[0] else file_length_list[0]
|
||||
# 用户不输入上限参数时前端传入'',则不对字数目上限控制
|
||||
self._file_maximum_length = float("inf") if not file_length_list[1] else file_length_list[1]
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
sample[self.text_key] = self._file_with_short_or_long_length_filter(sample[self.text_key],
|
||||
sample[self.filename_key])
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
f"method: FileWithShortOrLongLengthFilter costs {(time.time() - start):6f} s")
|
||||
return sample
|
||||
|
||||
def _strip_unicode_whitespace(self, text: str):
|
||||
# 常见 Unicode 空格符(涵盖普通空格、全角空格、零宽空格等)
|
||||
pattern = r'[\u0020\u00A0\u1680\u2000-\u200F\u202F\u205F\u3000]+'
|
||||
# 匹配首尾的空格符
|
||||
pattern = fr'^{pattern}|{pattern}$'
|
||||
return re.sub(pattern, '', text)
|
||||
|
||||
def _file_with_short_or_long_length_filter(self, input_data: str, file_name):
|
||||
input_data_tmp = self._strip_unicode_whitespace(input_data)
|
||||
if len(input_data_tmp) < self._file_minimum_length or len(input_data_tmp) > self._file_maximum_length:
|
||||
logger.info(f"The length of input_data is: {len(input_data_tmp)}, "
|
||||
f"which is not within the threshold range of {self._file_minimum_length} "
|
||||
f"and {self._file_maximum_length}. {file_name} is filtered.")
|
||||
return ""
|
||||
return input_data
|
||||
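The helper above trims only leading and trailing Unicode space characters before measuring the length, so interior whitespace still counts toward the threshold check. A small demonstration of the same pattern:

```python
# Demonstration of the stripping regex used by _strip_unicode_whitespace above.
import re

spaces = r'[\u0020\u00A0\u1680\u2000-\u200F\u202F\u205F\u3000]+'
pattern = fr'^{spaces}|{spaces}$'

text = "\u3000\u3000正文 内容  "
stripped = re.sub(pattern, '', text)
print(repr(stripped))   # leading ideographic spaces and trailing spaces removed, inner space kept
```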
25
runtime/ops/formatter/__init__.py
Normal file
25
runtime/ops/formatter/__init__.py
Normal file
@@ -0,0 +1,25 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datamate.core.base_op import OPERATORS
|
||||
from datamate.common.utils.custom_importer import CustomImporter
|
||||
|
||||
|
||||
def _configure_importer():
|
||||
base_path = Path(__file__).resolve().parent
|
||||
sys.meta_path.append(CustomImporter(base_path))
|
||||
|
||||
|
||||
_configure_importer()
|
||||
|
||||
|
||||
def _import_operators():
|
||||
from . import text_formatter
|
||||
from . import word_formatter
|
||||
from . import img_formatter
|
||||
from . import file_exporter
|
||||
from . import slide_formatter
|
||||
|
||||
|
||||
_import_operators()
|
||||
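The package `__init__` files in this commit register each operator by name and import path rather than importing the implementation eagerly. The real `OPERATORS` registry lives in `datamate.core.base_op` and is not part of this commit; the sketch below only illustrates the same register-by-path idea:

```python
# Illustration only (assumption): not the datamate Registry implementation.
import importlib

class LazyRegistry:
    def __init__(self):
        self._paths = {}

    def register_module(self, module_name: str, module_path: str) -> None:
        self._paths[module_name] = module_path   # record where the operator lives

    def build(self, module_name: str, *args, **kwargs):
        module = importlib.import_module(self._paths[module_name])  # import on first use
        return getattr(module, module_name)(*args, **kwargs)

registry = LazyRegistry()
registry.register_module(module_name='TextFormatter',
                         module_path='ops.formatter.text_formatter.process')
```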
6
runtime/ops/formatter/file_exporter/__init__.py
Normal file
6
runtime/ops/formatter/file_exporter/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='FileExporter',
|
||||
module_path="ops.formatter.file_exporter.process")
|
||||
16
runtime/ops/formatter/file_exporter/metadata.yml
Normal file
16
runtime/ops/formatter/file_exporter/metadata.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
name: '落盘算子'
|
||||
name_en: 'Save File Operator'
|
||||
description: '将文件内容保存为文件。'
|
||||
description_en: 'Save the file data as a file.'
|
||||
language: 'Python'
|
||||
vendor: 'Huawei'
|
||||
raw_id: 'FileExporter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'others'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'all'
|
||||
outputs: 'all'
|
||||
144
runtime/ops/formatter/file_exporter/process.py
Normal file
144
runtime/ops/formatter/file_exporter/process.py
Normal file
@@ -0,0 +1,144 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: 文件落盘保存
|
||||
Create: 2024/06/06 15:43
|
||||
"""
|
||||
import time
|
||||
import os
|
||||
import uuid
|
||||
from typing import Tuple, Dict, Any
|
||||
from loguru import logger
|
||||
|
||||
from datamate.core.constant import Fields
|
||||
from datamate.core.base_op import Mapper
|
||||
from datamate.common.utils import check_valid_path
|
||||
|
||||
|
||||
class FileExporter(Mapper):
|
||||
"""把输入的json文件流抽取为txt"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(FileExporter, self).__init__(*args, **kwargs)
|
||||
self.last_ops = True
|
||||
self.text_support_ext = kwargs.get("text_support_ext", ['txt', 'html', 'md', 'markdown',
|
||||
'xml', 'json', 'doc', 'docx', 'pdf'])
|
||||
self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp'])
|
||||
self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff'])
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = sample[self.filetype_key]
|
||||
|
||||
try:
|
||||
start = time.time()
|
||||
if file_type in self.text_support_ext:
|
||||
sample, save_path = self.get_textfile_handler(sample)
|
||||
elif file_type in self.data_support_ext:
|
||||
sample, save_path = self.get_datafile_handler(sample)
|
||||
elif file_type in self.medical_support_ext:
|
||||
sample, save_path = self.get_medicalfile_handler(sample)
|
||||
else:
|
||||
raise TypeError(f"{file_type} is unsupported! please check support_ext in FileExporter Ops")
|
||||
|
||||
if sample[self.text_key] == '' and sample[self.data_key] == b'':
|
||||
sample[self.filesize_key] = "0"
|
||||
return sample
|
||||
|
||||
if save_path:
|
||||
self.save_file(sample, save_path)
|
||||
sample[self.text_key] = ''
|
||||
sample[self.data_key] = b''
|
||||
sample[Fields.result] = True
|
||||
|
||||
file_type = save_path.split('.')[-1]
|
||||
sample[self.filetype_key] = file_type
|
||||
|
||||
base_name, _ = os.path.splitext(file_name)
|
||||
new_file_name = base_name + '.' + file_type
|
||||
sample[self.filename_key] = new_file_name
|
||||
|
||||
base_name, _ = os.path.splitext(save_path)
|
||||
sample[self.filepath_key] = base_name
|
||||
file_size = os.path.getsize(base_name)
|
||||
sample[self.filesize_key] = f"{file_size}"
|
||||
|
||||
logger.info(f"origin file named {file_name} has been save to {save_path}")
|
||||
logger.info(f"fileName: {sample[self.filename_key]}, "
|
||||
f"method: FileExporter costs {time.time() - start:.6f} s")
|
||||
except UnicodeDecodeError as err:
|
||||
logger.error(f"fileName: {sample[self.filename_key]}, "
|
||||
f"method: FileExporter causes decode error: {err}")
|
||||
raise
|
||||
return sample
|
||||
|
||||
def get_save_path(self, sample: Dict[str, Any], target_type) -> str:
|
||||
export_path = os.path.abspath(sample[self.export_path_key])
|
||||
file_name = sample[self.filename_key]
|
||||
new_file_name = os.path.splitext(file_name)[0] + '.' + target_type
|
||||
|
||||
if not check_valid_path(export_path):
|
||||
os.makedirs(export_path, exist_ok=True)
|
||||
res = os.path.join(export_path, new_file_name)
|
||||
return res
|
||||
|
||||
def get_textfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
|
||||
target_type = sample.get("target_type", None)
|
||||
|
||||
# target_type存在则保存为扫描件, docx格式
|
||||
if target_type:
|
||||
sample = self._get_from_data(sample)
|
||||
save_path = self.get_save_path(sample, target_type)
|
||||
# 不存在则保存为txt文件,正常文本清洗
|
||||
else:
|
||||
sample = self._get_from_text(sample)
|
||||
save_path = self.get_save_path(sample, 'txt')
|
||||
return sample, save_path
|
||||
|
||||
def get_datafile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
|
||||
target_type = sample.get("target_type", None)
|
||||
|
||||
# target_type存在, 图转文保存为target_type,markdown格式
|
||||
if target_type:
|
||||
sample = self._get_from_text(sample)
|
||||
save_path = self.get_save_path(sample, target_type)
|
||||
# 不存在则保存为原本图片文件格式,正常图片清洗
|
||||
else:
|
||||
sample = self._get_from_data(sample)
|
||||
save_path = self.get_save_path(sample, sample[self.filetype_key])
|
||||
return sample, save_path
|
||||
|
||||
def get_medicalfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
|
||||
target_type = 'png'
|
||||
|
||||
sample = self._get_from_data(sample)
|
||||
save_path = self.get_save_path(sample, target_type)
|
||||
|
||||
return sample, save_path
|
||||
|
||||
def save_file(self, sample, save_path):
|
||||
file_name, _ = os.path.splitext(save_path)
|
||||
# 以二进制格式保存文件
|
||||
file_sample = sample[self.text_key].encode('utf-8') if sample[self.text_key] else sample[self.data_key]
|
||||
with open(file_name, 'wb') as f:
|
||||
f.write(file_sample)
|
||||
# 获取父目录路径
|
||||
|
||||
parent_dir = os.path.dirname(file_name)
|
||||
os.chmod(parent_dir, 0o770)
|
||||
os.chmod(file_name, 0o640)
|
||||
|
||||
def _get_from_data(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
sample[self.data_key] = bytes(sample[self.data_key])
|
||||
sample[self.text_key] = ''
|
||||
return sample
|
||||
|
||||
def _get_from_text(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
sample[self.data_key] = b''
|
||||
sample[self.text_key] = str(sample[self.text_key])
|
||||
return sample
|
||||
|
||||
def _get_uuid(self):
|
||||
res = str(uuid.uuid4())
|
||||
return res
|
||||
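`save_file` above always writes bytes (text content is UTF-8 encoded first) and then restricts permissions on the target file and its parent directory. The same pattern shown standalone, with placeholder paths:

```python
# Sketch of the write-and-restrict pattern used by save_file; paths are placeholders.
import os

def write_restricted(path: str, payload: bytes) -> None:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:                   # binary write for both text and image content
        f.write(payload)
    os.chmod(os.path.dirname(path), 0o770)        # group-accessible directory
    os.chmod(path, 0o640)                         # owner rw, group read-only file

write_restricted("/tmp/datamate_out/report.txt", "清洗后的正文".encode("utf-8"))
```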
6
runtime/ops/formatter/img_formatter/__init__.py
Normal file
6
runtime/ops/formatter/img_formatter/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='ImgFormatter',
|
||||
module_path="ops.formatter.img_formatter.process")
|
||||
16
runtime/ops/formatter/img_formatter/metadata.yml
Normal file
16
runtime/ops/formatter/img_formatter/metadata.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
name: '读取图片文件'
|
||||
name_en: 'Image File Reader'
|
||||
description: '读取图片文件。'
|
||||
description_en: 'Reads image files.'
|
||||
language: 'Python'
|
||||
vendor: 'Huawei'
|
||||
raw_id: 'ImgFormatter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'image'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'image'
|
||||
outputs: 'image'
|
||||
35
runtime/ops/formatter/img_formatter/process.py
Normal file
35
runtime/ops/formatter/img_formatter/process.py
Normal file
@@ -0,0 +1,35 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description:
|
||||
Create: 2024/1/30 15:24
|
||||
"""
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
from datamate.common.utils import numpy_to_bytes
|
||||
from datamate.core.base_op import Mapper
|
||||
|
||||
|
||||
class ImgFormatter(Mapper):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
file_name = sample[self.filename_key]
|
||||
file_type = "." + sample[self.filetype_key]
|
||||
file_path = sample[self.filepath_key]
|
||||
img_data = _img_extract(file_path)
|
||||
sample[self.data_key] = numpy_to_bytes(img_data, file_type)
|
||||
logger.info(f"fileName: {file_name}, method: ImgExtract costs {(time.time() - start):6f} s")
|
||||
return sample
|
||||
|
||||
|
||||
def _img_extract(file_path):
|
||||
return cv2.imdecode(np.fromfile(file_path, dtype=np.uint8), -1)
|
||||
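`_img_extract` reads the file into a NumPy buffer with `np.fromfile` and decodes it with `cv2.imdecode`, which avoids `cv2.imread`'s problems with non-ASCII paths; `numpy_to_bytes` is assumed to perform the inverse with `cv2.imencode`. A minimal round-trip sketch:

```python
# Sketch: decode/encode round trip; numpy_to_bytes is assumed to wrap cv2.imencode similarly.
import cv2
import numpy as np

def read_image(path: str) -> np.ndarray:
    return cv2.imdecode(np.fromfile(path, dtype=np.uint8), cv2.IMREAD_UNCHANGED)

def to_bytes(img: np.ndarray, ext: str = ".png") -> bytes:
    ok, buf = cv2.imencode(ext, img)
    if not ok:
        raise ValueError(f"encode failed for extension {ext}")
    return buf.tobytes()
```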
6
runtime/ops/formatter/slide_formatter/__init__.py
Normal file
6
runtime/ops/formatter/slide_formatter/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='SlideFormatter',
|
||||
module_path="ops.formatter.slide_formatter.process")
|
||||
16
runtime/ops/formatter/slide_formatter/metadata.yml
Normal file
16
runtime/ops/formatter/slide_formatter/metadata.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
name: '病理图片内容抽取'
|
||||
name_en: 'Pathology Image Content Extraction'
|
||||
description: '解析病理图片。'
|
||||
description_en: 'Analyze pathological images.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'SlideFormatter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'image'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'image'
|
||||
outputs: 'image'
|
||||
36
runtime/ops/formatter/slide_formatter/process.py
Normal file
36
runtime/ops/formatter/slide_formatter/process.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# -- encoding: utf-8 --
|
||||
|
||||
"""
|
||||
Description: 医疗图片解析载入
|
||||
Create: 2025/02/08 11:00
|
||||
"""
|
||||
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from datamate.core.base_op import Mapper
|
||||
|
||||
|
||||
class SlideFormatter(Mapper):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(SlideFormatter, self).__init__(*args, **kwargs)
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
'''
|
||||
Read a medical image and its corresponding mask file, each as an Image type in the Content value. Return the Content.
|
||||
'''
|
||||
start = time.time()
|
||||
file_type = sample[self.filetype_key]
|
||||
types_openslide = ['svs', 'tif', 'dcm', 'vms', 'vmu',
|
||||
'ndpi', 'scn', 'mrxs', 'tiff', 'svslide',
|
||||
'bif', 'czi', 'sdpc']
|
||||
if file_type not in types_openslide:
|
||||
raise TypeError(f"Format not supported: {file_type}. Supported formats are: {', '.join(types_openslide)}.")
|
||||
|
||||
file_name = sample[self.filename_key]
|
||||
logger.info(f"fileName: {file_name}, method: SlideFormatter costs {(time.time() - start):6f} s")
|
||||
# Not really loading the slide, instead, use path as lazy loading.
|
||||
return sample
|
||||
6
runtime/ops/formatter/text_formatter/__init__.py
Normal file
6
runtime/ops/formatter/text_formatter/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='TextFormatter',
|
||||
module_path="ops.formatter.text_formatter.process")
|
||||
16
runtime/ops/formatter/text_formatter/metadata.yml
Normal file
16
runtime/ops/formatter/text_formatter/metadata.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
name: 'TXT文本抽取'
|
||||
name_en: 'TXT Text Extraction'
|
||||
description: '抽取TXT中的文本'
|
||||
description_en: 'Extracts text from TXT files.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'TextFormatter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
44
runtime/ops/formatter/text_formatter/process.py
Normal file
44
runtime/ops/formatter/text_formatter/process.py
Normal file
@@ -0,0 +1,44 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: Json文本抽取
|
||||
Create: 2024/06/06 15:43
|
||||
"""
|
||||
import time
|
||||
from loguru import logger
|
||||
from typing import Dict, Any
|
||||
|
||||
from datamate.core.base_op import Mapper
|
||||
|
||||
|
||||
class TextFormatter(Mapper):
|
||||
"""把输入的json文件流抽取为txt"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(TextFormatter, self).__init__(*args, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def _extract_json(byte_io):
|
||||
"""将默认使用utf-8编码的Json文件流解码,抽取为txt"""
|
||||
# 用utf-8-sig的格式进行抽取,可以避免uft-8 BOM编码格式的文件在抽取后产生隐藏字符作为前缀。
|
||||
return byte_io.decode("utf-8-sig").replace("\r\n", "\n")
|
||||
|
||||
def byte_read(self, sample: Dict[str, Any]):
|
||||
filepath = sample[self.filepath_key]
|
||||
with open(filepath, "rb") as file:
|
||||
byte_data = file.read()
|
||||
sample[self.data_key] = byte_data
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
try:
|
||||
self.byte_read(sample)
|
||||
sample[self.text_key] = self._extract_json(sample[self.data_key])
|
||||
sample[self.data_key] = b"" # 将sample[self.data_key]置空
|
||||
logger.info(
|
||||
f"fileName: {sample[self.filename_key]}, method: TextFormatter costs {(time.time() - start):6f} s")
|
||||
except UnicodeDecodeError as err:
|
||||
logger.exception(f"fileName: {sample[self.filename_key]}, method: TextFormatter causes decode error: {err}")
|
||||
raise
|
||||
return sample
|
||||
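Decoding with `utf-8-sig` is what keeps a UTF-8 BOM from leaking into the extracted text as an invisible prefix character. A small demonstration:

```python
# Demonstration of the BOM handling described in _extract_json above.
raw = b"\xef\xbb\xbfhello\r\nworld"
print(repr(raw.decode("utf-8")))                            # '\ufeffhello\r\nworld'
print(repr(raw.decode("utf-8-sig").replace("\r\n", "\n")))  # 'hello\nworld'
```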
6
runtime/ops/formatter/word_formatter/__init__.py
Normal file
6
runtime/ops/formatter/word_formatter/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='WordFormatter',
|
||||
module_path="ops.formatter.word_formatter.process")
|
||||
16
runtime/ops/formatter/word_formatter/metadata.yml
Normal file
16
runtime/ops/formatter/word_formatter/metadata.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
name: 'Word文本抽取'
|
||||
name_en: 'Word Text Extraction'
|
||||
description: '抽取Word中的文本'
|
||||
description_en: 'Extracts text from Word files.'
|
||||
language: 'java'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'WordFormatter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
68
runtime/ops/formatter/word_formatter/process.py
Normal file
68
runtime/ops/formatter/word_formatter/process.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description:
|
||||
Create: 2024/1/30 15:24
|
||||
"""
|
||||
from loguru import logger
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
from datamate.common.utils import check_valid_path
|
||||
from datamate.core.base_op import Mapper
|
||||
|
||||
|
||||
class WordFormatter(Mapper):
|
||||
SEPERATOR = ' | '
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(WordFormatter, self).__init__(*args, **kwargs)
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
file_name = sample[self.filename_key]
|
||||
file_path = sample[self.filepath_key]
|
||||
file_type = sample[self.filetype_key]
|
||||
txt_content = self.word2html(file_path, file_type)
|
||||
sample[self.text_key] = txt_content
|
||||
logger.info(f"fileName: {file_name}, method: WordFormatter costs {(time.time() - start):6f} s")
|
||||
return sample
|
||||
|
||||
@staticmethod
|
||||
def word2html(file_path, file_type):
|
||||
check_valid_path(file_path)
|
||||
file_dir = file_path.rsplit('/', 1)[0]
|
||||
file_name = file_path.rsplit('/', 1)[1]
|
||||
html_file_path = os.path.join(file_dir, f"{file_name}.txt")
|
||||
|
||||
current_file_path = os.path.dirname(os.path.abspath(__file__))
|
||||
try:
|
||||
process = subprocess.Popen(
|
||||
['java', '-jar', f'{current_file_path}/../../../java_operator/WordFormatter-1.0.jar', file_path,
|
||||
html_file_path, file_type], shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
||||
stdout, _ = process.communicate(timeout=24 * 60 * 60)  # stderr is merged into stdout above
|
||||
if process.returncode == 0:
|
||||
logger.info(f"Convert {file_path} to txt successfully")
|
||||
else:
|
||||
logger.error(f"Convert {file_path} failed, error: {stdout.strip().decode('utf-8')}.")
|
||||
raise RuntimeError(f"WordFormatter failed to convert {file_path}")
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"Convert failed: {e}, return code: {e.returncode}")
|
||||
except FileNotFoundError:
|
||||
logger.error("LibreOffice command not found, please make sure it is available in PATH")
|
||||
except Exception as e:
|
||||
logger.error(f"An unexpected error occurred, convert failed: {e}", )
|
||||
|
||||
txt_content = ""  # fallback if the converted txt file cannot be read below
|
||||
try:
|
||||
with open(html_file_path, 'r', encoding='utf-8') as file:
|
||||
txt_content = file.read()
|
||||
os.remove(html_file_path)
|
||||
logger.info("Tmp docx file removed")
|
||||
except FileNotFoundError:
|
||||
logger.error(f"Tmp file {html_file_path} does not exist")
|
||||
except PermissionError:
|
||||
logger.error(f"You are not allowed to delete tmp file {html_file_path}")
|
||||
logger.info(f"Convert {html_file_path} to html success")
|
||||
return txt_content
|
||||
25
runtime/ops/llms/__init__.py
Normal file
25
runtime/ops/llms/__init__.py
Normal file
@@ -0,0 +1,25 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
since:
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datamate.common.utils.custom_importer import CustomImporter
|
||||
|
||||
|
||||
def _configure_importer():
|
||||
base_path = Path(__file__).resolve().parent
|
||||
sys.meta_path.append(CustomImporter(base_path))
|
||||
|
||||
|
||||
_configure_importer()
|
||||
|
||||
|
||||
def _import_operators():
|
||||
from . import qa_condition_evaluator
|
||||
from . import text_quality_evaluation
|
||||
|
||||
|
||||
_import_operators()
|
||||
10
runtime/ops/llms/qa_condition_evaluator/__init__.py
Normal file
10
runtime/ops/llms/qa_condition_evaluator/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
since:
|
||||
"""
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='QAConditionEvaluator',
|
||||
module_path="ops.llms.qa_condition_evaluator.process")
|
||||
16
runtime/ops/llms/qa_condition_evaluator/metadata.yml
Normal file
16
runtime/ops/llms/qa_condition_evaluator/metadata.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
name: 'QA评估'
|
||||
name_en: 'QA Assessment'
|
||||
description: '通过用户维度和相应描述进行QA对评估。'
|
||||
description_en: 'Perform QA assessment based on the user dimension and corresponding description.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'QAConditionEvaluator'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'consolidate'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
98
runtime/ops/llms/qa_condition_evaluator/process.py
Normal file
98
runtime/ops/llms/qa_condition_evaluator/process.py
Normal file
@@ -0,0 +1,98 @@
|
||||
# -- encoding: utf-8 --
|
||||
|
||||
"""
|
||||
Description: 基于LLM通过用户设置维度和相应描述进行QA对评估
|
||||
Create: 2023/11/7 9:26
|
||||
"""
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
from datamate.core.base_op import LLM
|
||||
|
||||
|
||||
class QAConditionEvaluator(LLM):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(QAConditionEvaluator, self).__init__(*args, **kwargs)
|
||||
self.pattern = r'结果[::] ?[YN]'
|
||||
self.template_path = Path(__file__).parent / "resources/template.txt"
|
||||
self.examples_path = Path(__file__).parent / "resources/examples.json"
|
||||
self.task_id = kwargs.get("taskId", "default_id")
|
||||
self.dimensions = kwargs.get("dimension", [
|
||||
{
|
||||
"dimension": "回答是否有针对性",
|
||||
"description": "回答应对问题中的所有疑问点提供正面、直接的回答,"
|
||||
"不应引起疑惑。同时,答案不应有任何内容的遗漏,需构成一个完整的陈述。"
|
||||
},
|
||||
{
|
||||
"dimension": "问题是否独立",
|
||||
"description": "仅分析问题,问题的主体和客体都比较明确,即使有省略,也符合语言习惯。"
|
||||
"在不需要补充其他信息的情况下不会引起疑惑。"
|
||||
},
|
||||
{
|
||||
"dimension": "语法是否错误",
|
||||
"description": "问题为疑问句,答案为陈述句; 不存在词语搭配不当的情况;连接词和标点符号不存在错用情况;"
|
||||
"逻辑混乱的情况不存在;语法结构都正确且完整;"
|
||||
}
|
||||
])
|
||||
|
||||
self.llm = self.get_llm(*args, **kwargs)
|
||||
self.prompts = self.build_llm_prompt(*args, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def _process_examples(dimension_example: List) -> str:
|
||||
if not dimension_example:
|
||||
return "\n"
|
||||
res = "\n以下是一些案例供你参考:"
|
||||
for single_example in dimension_example:
|
||||
res += (f"\n问题:{single_example['question']}"
|
||||
f"\n回答:{single_example['answer']}"
|
||||
f"\n分析思路:{single_example['evaluate']}"
|
||||
f"\n结果:{single_example['result']}\n")
|
||||
return res
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
qas = json.loads(sample[self.text_key])
|
||||
single_content_res = []
|
||||
for qa in qas:
|
||||
single_qa_res = []
|
||||
for dimension, prompt in self.prompts.items():
|
||||
local_result = self._llm_call_parse(qa, prompt, retry=2)
|
||||
single_qa_res.append({"dimension": dimension, "result": local_result})
|
||||
qa_response = {"qaId": qa["qaId"], "result": single_qa_res}
|
||||
single_content_res.append(qa_response)
|
||||
|
||||
sample[self.text_key] = "Sucess"
|
||||
self.save_sample(single_content_res, sample)
|
||||
cost_time = time.time() - start
|
||||
logger.info(f"task id: {self.task_id}, method: QAConditionEvaluator costs {cost_time:.6f} s")
|
||||
return sample
|
||||
|
||||
def build_llm_prompt(self, *args, **kwargs) -> Dict:
|
||||
templates = self.template_path.read_text(encoding="utf-8")
|
||||
examples_dict = json.loads(self.examples_path.read_text(encoding="utf-8"))
|
||||
prompts_dict = {}
|
||||
for dimension in self.dimensions:
|
||||
name, des = dimension["dimension"], dimension["description"]
|
||||
dimension_example = self._process_examples(examples_dict.get(name))
|
||||
dimension_prompt = templates.format(criterion=des, examples=dimension_example, question="{question}",
|
||||
answer="{answer}")
|
||||
prompts_dict[name] = dimension_prompt
|
||||
return prompts_dict
|
||||
|
||||
def _llm_call_parse(self, data: Dict, prompt: str, retry: int = 2):
|
||||
try:
|
||||
for _ in range(retry):
|
||||
response = self.llm(prompt.format(question=data["question"], answer=data["answer"]))
|
||||
result = re.findall(self.pattern, response)
|
||||
if result:
|
||||
return "Y" in result[0]
|
||||
except RuntimeError as e:
|
||||
logger.error(f"method: QAConditionEvaluator execution error, cause by {e}")
|
||||
return False
|
||||
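`_llm_call_parse` accepts both full-width and half-width colons in the model's verdict line and treats anything without a `Y` as a failure. A small demonstration of the pattern:

```python
# Demonstration of the verdict-parsing regex used by QAConditionEvaluator above.
import re

pattern = r'结果[::] ?[YN]'
response = "分析思路:问题主体明确,回答完整。\n结果:Y"
matches = re.findall(pattern, response)
print(bool(matches) and "Y" in matches[0])   # True
```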
@@ -0,0 +1,36 @@
|
||||
{
|
||||
"回答是否有针对性": [
|
||||
{
|
||||
"question": "下发 IO 测试的命令格式是什么??",
|
||||
"answer": "下发 IO 测试的命令格式为:",
|
||||
"evaluate": "回答正面直接的回答了问题(命令格式是什么),但是答案以冒号结尾并不是完整的陈述,因此结果是N",
|
||||
"result": "N"
|
||||
},
|
||||
{
|
||||
"question": "HyperCDP的约束关系是什么?",
|
||||
"answer": "HyperCDP特性和OceanStor Dorado的其他功能之间存在一定的约束关系。",
|
||||
"evaluate": "回答提及到了HyperCDP特性和OceanStor Dorado的其他功能之间存在约束关系,并未直接回答约束关系是什么,因此结果是N。",
|
||||
"result": "N"
|
||||
}
|
||||
],
|
||||
"问题是否独立": [
|
||||
{
|
||||
"question": "这段文档主要讲述了什么内容?",
|
||||
"answer": "这段文档主要讲述了在治疗月经失调和怀孕问题时,需要根据患者的具体症状来调整治疗方法。主要方法包括温养肾气、调理气血、调补冲任、疏肝解郁、燥湿化痰、理气调经以及活血调经等。",
|
||||
"evaluate": "问题中省略了“这段文档”的具体指代,如果没有额外的上下文信息,单独这个问题可能会引起疑惑,因为听者或读者可能不知道指的是哪篇文章。在大多数情况下,为了使问题更加明确,提问者应该提供文章的标题、作者或相关背景信息。",
|
||||
"result": "N"
|
||||
},
|
||||
{
|
||||
"question": "这些鱼体内的不饱和脂肪酸含量非常高,这是为什么?",
|
||||
"answer": "因为这些鱼体内的脂肪酸构成比较特殊,不饱和脂肪酸的含量非常高,而饱和脂肪酸的含量则较低。",
|
||||
"evaluate": "在中文提问的语言习惯中,问题中使用“这些”来指代一组已经提及或已知的鱼是合适的。然而,问题本身并没有提供足够的信息来确定“这些鱼”具体指的是哪些鱼,这就造成了主体的不明确。",
|
||||
"result": "N"
|
||||
},
|
||||
{
|
||||
"question": "如何食用黑芝麻以达到最佳效果?",
|
||||
"answer": "我们平日里吃到的芝麻制品多为芝麻酱和芝麻香油。吃整粒的芝麻对于营养的吸收来说并不是最好的,因为芝麻的外面有一层稍硬的膜,只有把它碾碎,其中的营养素才能被吸收。所以,芝麻最好碾碎了再吃。",
|
||||
"evaluate": "在这个问题中,主体“黑芝麻”是具体的食物,客体“食用方法以达到最佳效果”也是明确的,因为人们通常关心如何食用某种食物以获得最大的健康益处。因此这个问题是符合标准的。",
|
||||
"result": "Y"
|
||||
}
|
||||
]
|
||||
}
|
||||
107
runtime/ops/llms/qa_condition_evaluator/resources/readme.md
Normal file
107
runtime/ops/llms/qa_condition_evaluator/resources/readme.md
Normal file
@@ -0,0 +1,107 @@
|
||||
# QA Assessment Plugin
|
||||
## Background
|
||||
Evaluates QA pairs against configured dimensions and their descriptions; user-defined dimensions are supported.
|
||||
### Constraints:
|
||||
- Fewer than 10 dimensions
|
||||
- Dimension names shorter than 20 characters
|
||||
- Depends on an LLM service with the following request/response format:
|
||||
```python
|
||||
# 输入
|
||||
request_template = {
|
||||
"prompt": "你好",
|
||||
"max_length": 2024,
|
||||
"top_n": 0.9,
|
||||
"temperature": 0.9
|
||||
}
|
||||
# 输出
|
||||
response_template = {
|
||||
"response":"XXX"
|
||||
}
|
||||
```
|
||||
#### Three default dimensions:
|
||||
- 问题是否独立
|
||||
- 回答是否有针对性
|
||||
- 语法是否错误
|
||||
|
||||
## Interface input
|
||||
```python
|
||||
inputs = [[
|
||||
{
|
||||
"businessData": {
|
||||
"params": {
|
||||
"taskId":1,
|
||||
"LLMUrl":"https://x.x.x.x:xxxx/qwen",
|
||||
"LLMHeaders":{"Content-Type": "application/json","User-Agent":"Client"},
|
||||
"LLMBody":{
|
||||
"prompt": "你好",
|
||||
"max_length": 2024,
|
||||
"top_n": 0.9,
|
||||
"temperature": 0.9
|
||||
},
|
||||
"dimension":[
|
||||
{"dimension":"回答是否有针对性",
|
||||
"description":"回答应对问题中的所有疑问点提供正面、直接的回答,不应引起疑惑。同时,答案不应有任何内容的遗漏,需构成一个完整的陈述。"
|
||||
},
|
||||
{"dimension":"问题是否独立",
|
||||
"description":"仅分析问题,问题的主体和客体都比较明确,即使有省略,也符合语言习惯。在不需要补充其他信息的情况下不会引起疑惑。"
|
||||
},
|
||||
{"dimension":"语法是否错误",
|
||||
"description":"问题为疑问句,答案为陈述句; 不存在词语搭配不当的情况;连接词和标点符号不存在错用情况;逻辑混乱的情况不存在;语法结构都正确且完整;"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"passData": {
|
||||
"data": "",
|
||||
"text": "[{\"question\":\"什么是秋燥、秋困和秋冻?\",\"answer\":\"秋燥、秋困和秋冻是秋天常见的三种症状和养生问题。秋燥是指秋天天气干燥,导致人体水分流失,出现皮肤发痒、嘴唇起皮、鼻咽干燥等症状;秋困是指秋天天气凉爽,人体代谢下降,导致人感到无精打采、呵欠连天、昏昏欲睡等症状;秋冻是指秋天气温下降,人体需要适应气温的变化,不能一下子穿上很多衣服,让身体适应气温的变化。\",\"qaId\":1}]",
|
||||
"meta": {
|
||||
}
|
||||
},
|
||||
"contextData": {}
|
||||
}
|
||||
]]
|
||||
|
||||
```
|
||||
## Interface output
|
||||
```python
|
||||
outputs = [
|
||||
{
|
||||
"businessData": {
|
||||
"params": {
|
||||
"taskId": 1,
|
||||
"LLMUrl": "https://x.x.x.x:xxxx/qwen",
|
||||
"LLMHeaders": {
|
||||
"Content-Type": "application/json",
|
||||
"User-Agent": "Client"
|
||||
},
|
||||
"LLMBody": {
|
||||
"prompt": "你好",
|
||||
"max_length": 2024,
|
||||
"top_n": 0.9,
|
||||
"temperature": 0.9
|
||||
},
|
||||
"dimension": [
|
||||
{
|
||||
"dimension": "回答是否有针对性",
|
||||
"description": "回答应对问题中的所有疑问点提供正面、直接的回答,不应引起疑惑。同时,答案不应有任何内容的遗漏,需构成一个完整的陈述。"
|
||||
},
|
||||
{
|
||||
"dimension": "问题是否独立",
|
||||
"description": "仅分析问题,问题的主体和客体都比较明确,即使有省略,也符合语言习惯。在不需要补充其他信息的情况下不会引起疑惑。"
|
||||
},
|
||||
{
|
||||
"dimension": "语法是否错误",
|
||||
"description": "问题为疑问句,答案为陈述句; 不存在词语搭配不当的情况;连接词和标点符号不存在错用情况;逻辑混乱的情况不存在;语法结构都正确且完整;"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"passData": {
|
||||
"data": "",
|
||||
"text": "[{\"qaId\": 1, \"result\": [{\"dimension\": \"\回\答\是\否\有\针\对\性\", \"result\": true}, {\"dimension\": \"\问\题\是\否\独\立\", \"result\": true}, {\"dimension\": \"\语\法\是\否\错\误\", \"result\": true}]}]",
|
||||
"meta": {}
|
||||
},
|
||||
"contextData": {}
|
||||
}
|
||||
]
|
||||
```
|
||||
@@ -0,0 +1,17 @@
|
||||
你将会获得一个问答对,判断问答对是否满足以下标准:
|
||||
标准:"{criterion}"
|
||||
|
||||
要求:
|
||||
1. 结合以上标准,一步一步的分析问答对是否满足标准,按照模板输出你的回答。
|
||||
2. 如果你对自己的判断没有较强的信心,直接算作不满足标准。
|
||||
3. 你的最终裁定应该是'Y'表示是(符合标准)或'N'表示否(不符合标准)。
|
||||
4. 如果你的回答不符合模板格式和规范,重新思考回答。
|
||||
{examples}
|
||||
问答对:
|
||||
问题:"{question}"
|
||||
答案:"{answer}"
|
||||
|
||||
模板:
|
||||
结果:[插入结果N或Y]
|
||||
分析思路:XXX
|
||||
"""
|
||||
6
runtime/ops/llms/text_quality_evaluation/__init__.py
Normal file
6
runtime/ops/llms/text_quality_evaluation/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
OPERATORS.register_module(module_name='TextQualityEvaluation',
|
||||
module_path="ops.llms.text_quality_evaluation.process")
|
||||
43
runtime/ops/llms/text_quality_evaluation/constant.py
Normal file
43
runtime/ops/llms/text_quality_evaluation/constant.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: 指令数据生成常量
|
||||
Create: 2023/11/20 16:20
|
||||
"""
|
||||
|
||||
EVAL_DIMENSION_MAP = [
|
||||
{
|
||||
"dimension": "完备性",
|
||||
"description": "数据的记录和信息是否是完整的,是否存在缺失的情况",
|
||||
"score_name": "qua_score"
|
||||
},
|
||||
{
|
||||
"dimension": "一致性",
|
||||
"description": "同一指标在不同地方的结果是否一致",
|
||||
"score_name": "logic_score"
|
||||
},
|
||||
{
|
||||
"dimension": "有效性",
|
||||
"description": "该样本涉及某领域的信息量",
|
||||
"score_name": "effective_score"
|
||||
}
|
||||
]
|
||||
|
||||
BUSINESS_EVAL_DIMENSION_MAP = [
|
||||
{
|
||||
"dimension": "金融",
|
||||
"description": "涉及保险合同、保险问答、年报、资产负债表、金融新闻、保险从业资格CICE、基金从业资格、期货从业资格、注册会计师(CPA"
|
||||
")、理财规划师、税务师、精算师-金融数学、经济师、证券从业资格、银行从业资格等相关金融行业知识",
|
||||
"score_name": "finance_score"
|
||||
},
|
||||
{
|
||||
"dimension": "存储",
|
||||
"description": "存储",
|
||||
"score_name": "storage_score"
|
||||
},
|
||||
{
|
||||
"dimension": "医疗",
|
||||
"description": "涵盖中医科、儿科、内科、口腔科、外科、妇产科、心理科学、急诊科、感染与免疫科、生殖健康科、男性健康科、皮肤性病科、眼耳鼻喉科、神经科学、肿瘤科等医疗相关领域",
|
||||
"score_name": "medical_score"
|
||||
}
|
||||
]
|
||||
16
runtime/ops/llms/text_quality_evaluation/metadata.yml
Normal file
16
runtime/ops/llms/text_quality_evaluation/metadata.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
name: '文本质量评估'
|
||||
name_en: 'Text Quality Evaluation'
|
||||
description: '通过用户维度和相应描述进行文本评估。'
|
||||
description_en: 'Text evaluation is performed based on user dimensions and corresponding descriptions.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'TextQualityEvaluation'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'consolidate'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
113
runtime/ops/llms/text_quality_evaluation/process.py
Normal file
113
runtime/ops/llms/text_quality_evaluation/process.py
Normal file
@@ -0,0 +1,113 @@
|
||||
# -- encoding: utf-8 --
|
||||
|
||||
"""
|
||||
Description: 基于LLM通过用户设置维度和相应描述进行文本质量评估
|
||||
Create: 2025/3/14 11:00
|
||||
"""
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from functools import partial
|
||||
from typing import Dict, Any
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from datamate.common.utils.text_splitter import TextSplitter
|
||||
from datamate.core.base_op import LLM
|
||||
from .constant import EVAL_DIMENSION_MAP, BUSINESS_EVAL_DIMENSION_MAP
|
||||
from .prompt_config import TEXT_QUALITY_EVALUATE_TEMPLATE
|
||||
|
||||
CHUNK_SIZE = 4000
|
||||
CHUNK_OVERLAP = 0
|
||||
|
||||
|
||||
class TextQualityEvaluation(LLM):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(TextQualityEvaluation, self).__init__(*args, **kwargs)
|
||||
self.total_length = 0
|
||||
self.text_list = []
|
||||
self.total_scores = [0, 0, 0, 0, 0, 0]
|
||||
self.text_splitter = TextSplitter(1024 * 1024, CHUNK_SIZE, CHUNK_OVERLAP)
|
||||
self.pattern = r'\d+\.\d+'
|
||||
self.task_id = kwargs.get("taskId", "default_id")
|
||||
|
||||
self.llm = self.get_llm(*args, **kwargs)
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
tmp_text_list = self.text_splitter.split_text(sample[self.text_key])
|
||||
logger.info(f"task id: {self.task_id}, the length of chunks: {len(tmp_text_list)}")
|
||||
self.text_list = tmp_text_list
|
||||
text_res = {}
|
||||
self._evaluate_concurrently_text(text_res)
|
||||
|
||||
sample[self.text_key] = "Success"
|
||||
self.save_sample([text_res], sample)
|
||||
cost_time = time.time() - start
|
||||
logger.info(f"task id: {self.task_id}, method: TextQualityEvaluation costs {cost_time:.6f} s")
|
||||
self.text_list = []
|
||||
return sample
|
||||
|
||||
def _evaluate_concurrently_text(self, text_res, max_workers: int = 5):
|
||||
for eval_dimension in EVAL_DIMENSION_MAP + BUSINESS_EVAL_DIMENSION_MAP:
|
||||
text_res[eval_dimension["score_name"]] = 0
|
||||
self.total_scores = [0, 0, 0, 0, 0, 0]
|
||||
self.total_length = 0
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
# 使用 partial 绑定多参数
|
||||
future_to_params = {
|
||||
executor.submit(
|
||||
partial(self.get_current_score_concurrently, text)): text
|
||||
for text in self.text_list
|
||||
}
|
||||
for future in as_completed(future_to_params):
|
||||
self.parse_execute_result(future, future_to_params)
|
||||
for _, eval_dimension in enumerate(EVAL_DIMENSION_MAP + BUSINESS_EVAL_DIMENSION_MAP):
|
||||
total_score = self.total_scores[_]
|
||||
text_res[eval_dimension["score_name"]] = 0
|
||||
if self.total_length > 0:
|
||||
text_res[eval_dimension["score_name"]] = total_score / self.total_length
|
||||
|
||||
def parse_execute_result(self, future, future_to_params):
|
||||
text = future_to_params[future]
|
||||
try:
|
||||
scores = future.result()
|
||||
if scores and len(scores) == len(self.total_scores):
|
||||
self.total_length += len(text)
|
||||
for _, score in enumerate(scores):
|
||||
self.total_scores[_] = self.total_scores[_] + score * len(text)
|
||||
except Exception as e:
|
||||
logger.error(f"Evaluate error, error details: {e}")
|
||||
|
||||
def get_current_score_concurrently(self, text, retry: int = 2):
|
||||
dimension_list = []
|
||||
for eval_dimension in EVAL_DIMENSION_MAP + BUSINESS_EVAL_DIMENSION_MAP:
|
||||
dimension = eval_dimension["dimension"] + ":" + eval_dimension["description"]
|
||||
dimension_list.append(dimension)
|
||||
prompt = TEXT_QUALITY_EVALUATE_TEMPLATE.format(context=text, dimension0=dimension_list[0],
|
||||
dimension1=dimension_list[1], dimension2=dimension_list[2],
|
||||
dimension3=dimension_list[3], dimension4=dimension_list[4],
|
||||
dimension5=dimension_list[5])
|
||||
retry_time = 0
|
||||
while True:
|
||||
try:
|
||||
return self.get_scores(prompt)
|
||||
except RuntimeError as e:
|
||||
if retry_time < retry:
|
||||
retry_time += 1
|
||||
else:
|
||||
logger.warning(f"Request LLM error, details: {e}")
|
||||
return []
|
||||
|
||||
def get_scores(self, prompt):
|
||||
response = self.llm(prompt)
|
||||
scores_str_list = response.split(",")
|
||||
scores = []
|
||||
for scores_str in scores_str_list:
|
||||
decimals = re.findall(self.pattern, scores_str)
|
||||
if decimals:
|
||||
score = float(decimals[-1])
|
||||
if 0 <= score <= 1:
|
||||
scores.append(score)
|
||||
logger.info(f"current evaluate scores: {scores}")
|
||||
return scores
|
||||
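Each chunk's scores are weighted by the chunk's character count, so the final dimension score is a length-weighted average over all chunks. A worked example with toy numbers:

```python
# Worked example of the length-weighted averaging in parse_execute_result above.
chunks = [("第一段" * 100, 0.8), ("第二段" * 300, 0.4)]    # (chunk text, score for one dimension)
total = sum(score * len(text) for text, score in chunks)   # 0.8*300 + 0.4*900 = 600
length = sum(len(text) for text, _ in chunks)              # 1200
print(total / length)                                      # 0.5 -- the longer chunk dominates
```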
32
runtime/ops/llms/text_quality_evaluation/prompt_config.py
Normal file
32
runtime/ops/llms/text_quality_evaluation/prompt_config.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: prompt 配置文件
|
||||
Create: 2024/02/07
|
||||
"""
|
||||
|
||||
TEXT_QUALITY_EVALUATE_TEMPLATE = """
|
||||
===
|
||||
<Role>:
|
||||
你是一位擅长文本质量评估的数据处理专家。
|
||||
|
||||
===
|
||||
<Instructions>:
|
||||
你擅长根据已知的Context内容, 结合每个评估标准Dimension,给出该标准下文本质量评估结果,结果为0-1的小数:
|
||||
- 充分理解Context内容,质量评估时要覆盖Context的主要内容,不能随意臆想和编造。
|
||||
- 如果你对自己的判断没有较强的信心,直接算作不满足标准,输出0.0分。
|
||||
- 总计会有六个评估标准,分别是Dimension0~Dimension5,每个评估标准都需要给出对应标准下的评估分数,分数为0-1的小数。
|
||||
- 每个评估标准都只输出最终的打分,不能输出额外的内容;每个评估标准的评估结果之间用英文逗号“,”分开。
|
||||
===
|
||||
<Task>
|
||||
请基于下面的参考信息和<Instructions>,生成符合要求的内容。
|
||||
输入:
|
||||
参考信息Context是: "{context}"
|
||||
第一个评估标准Dimension0是: "{dimension0}"
|
||||
第二个评估标准Dimension1是: "{dimension1}"
|
||||
第三个评估标准Dimension2是: "{dimension2}"
|
||||
第四个评估标准Dimension3是: "{dimension3}"
|
||||
第五个评估标准Dimension4是: "{dimension4}"
|
||||
第六个评估标准Dimension5是: "{dimension5}"
|
||||
输出:
|
||||
"""
|
||||
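The template is consumed with str.format, taking one context string plus six "dimension:description" strings, mirroring get_current_score_concurrently above. A short sketch with made-up dimension entries; the real EVAL_DIMENSION_MAP and BUSINESS_EVAL_DIMENSION_MAP live elsewhere in this operator:

# Hypothetical dimension entries; the real ones come from EVAL_DIMENSION_MAP / BUSINESS_EVAL_DIMENSION_MAP.
dimensions = [
    {"dimension": "流畅性", "description": "文本是否通顺流畅"},
    {"dimension": "逻辑连贯性", "description": "内容是否前后连贯"},
    {"dimension": "完整性", "description": "信息是否完整"},
    {"dimension": "格式一致性", "description": "排版格式是否统一"},
    {"dimension": "准确性", "description": "内容是否准确"},
    {"dimension": "安全性", "description": "是否包含不当内容"},
]
dimension_list = [d["dimension"] + ":" + d["description"] for d in dimensions]
prompt = TEXT_QUALITY_EVALUATE_TEMPLATE.format(
    context="待评估的文本内容",
    dimension0=dimension_list[0], dimension1=dimension_list[1], dimension2=dimension_list[2],
    dimension3=dimension_list[3], dimension4=dimension_list[4], dimension5=dimension_list[5])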
@@ -0,0 +1,98 @@
{
  "对文本逻辑连贯性的评分,范围1-5分": [
    {
      "question": "今天天气很好,我吃了苹果。数学题很难,天空是蓝色的。狗会叫,鸟会飞。1234567890。",
      "answer": "1",
      "evaluate": "这是一段完全没有逻辑的文字,主题不断跳跃,没有任何结构可循。",
      "result": "1"
    },
    {
      "question": "我今天早上吃了面包,然后去了公园。天气很好,但突然下起了雨。我思考人生的意义,然后决定回家吃冰淇淋。",
      "answer": "2",
      "evaluate": "内容尚可理解,但逻辑连贯性较差,主题跳跃明显。",
      "result": "2"
    },
    {
      "question": "人工智能正在改变世界。它可以帮助我们解决复杂的问题,但也带来了伦理挑战。例如,自动驾驶汽车需要做出道德决策。此外,人工智能还可以用于医疗诊断。",
      "answer": "3",
      "evaluate": "内容结构尚可,逻辑基本连贯,但存在少量混乱或跳跃。",
      "result": "3"
    },
    {
      "question": "人工智能正在改变世界。它可以帮助我们解决复杂的问题,但也带来了伦理挑战。例如,自动驾驶汽车需要做出道德决策。此外,人工智能还可以用于医疗诊断。这些应用展示了其潜力和局限性。",
      "answer": "4",
      "evaluate": "内容结构清晰,逻辑连贯,仅有极小混乱或跳跃。",
      "result": "4"
    },
    {
      "question": "人工智能正在改变世界。它可以帮助我们解决复杂的问题,但也带来了伦理挑战。例如,自动驾驶汽车需要做出道德决策。此外,人工智能还可以用于医疗诊断。这些应用展示了其潜力和局限性,同时也引发了关于技术与人类关系的深入讨论。",
      "answer": "5",
      "evaluate": "内容结构清晰,逻辑严密,无任何混乱或跳跃。",
      "result": "5"
    }
  ],
  "对文本格式一致性的评分,范围1-5分": [
    {
      "question": "巴黎的埃菲尔铁塔很高,伦敦的塔桥很老,纽约的自由女神像很美。东京的涩谷很有名,新加坡的滨海湾很繁华。",
      "answer": "1",
      "evaluate": "这是一段完全没有格式一致性的文字,段落之间没有任何分隔,内容完全混乱。",
      "result": "1"
    },
    {
      "question": "巴黎的埃菲尔铁塔很高,伦敦的塔桥很老,纽约的自由女神像很美。东京的涩谷很有名,新加坡的滨海湾很繁华。这些地方都很有特色,但描述方式不统一。",
      "answer": "2",
      "evaluate": "内容尚可理解,但格式一致性较差,段落之间没有任何分隔,存在较多格式混乱。",
      "result": "2"
    },
    {
      "question": "巴黎的埃菲尔铁塔很高。伦敦的塔桥很老。纽约的自由女神像很美。东京的涩谷很有名。新加坡的滨海湾很繁华。这些地方都有独特的建筑风格。",
      "answer": "3",
      "evaluate": "内容结构尚可,格式基本一致,但存在少量格式混乱或不一致。",
      "result": "3"
    },
    {
      "question": "巴黎的埃菲尔铁塔很高。\n伦敦的塔桥很老。\n纽约的自由女神像很美。\n东京的涩谷很有名。\n新加坡的滨海湾很繁华。\n这些地方都有独特的建筑风格。",
      "answer": "4",
      "evaluate": "内容结构清晰,格式一致,仅有极小格式混乱或不一致。",
      "result": "4"
    },
    {
      "question": "### 世界著名建筑\n- **巴黎的埃菲尔铁塔**:高耸入云,象征浪漫。\n- **伦敦的塔桥**:历史悠久,充满工业风格。\n- **纽约的自由女神像**:象征自由,举世闻名。\n- **东京的涩谷**:现代都市的代表,充满活力。\n- **新加坡的滨海湾**:融合自然与现代建筑,令人惊叹。\n\n这些地方都有独特的建筑风格,展现了不同的文化特色。",
      "answer": "5",
      "evaluate": "内容结构清晰,格式完全一致,无任何混乱或格式错误。",
      "result": "5"
    }
  ],
  "对文本信息完整性的评分,范围1-5分": [
    {
      "question": "这款手机很好。",
      "answer": "1",
      "evaluate": "这是一段完全没有信息完整性的文字,内容过于简单,没有任何具体信息。",
      "result": "1"
    },
    {
      "question": "这款手机很好,屏幕很大。",
      "answer": "2",
      "evaluate": "内容尚可理解,但信息完整性较差,缺乏关键细节,如性能、价格等。",
      "result": "2"
    },
    {
      "question": "这款手机很好,屏幕很大,运行速度快。",
      "answer": "3",
      "evaluate": "内容结构尚可,信息基本完整,但存在关键信息遗漏,如摄像头质量、价格等。",
      "result": "3"
    },
    {
      "question": "这款手机很好,屏幕很大,运行速度快,摄像头也很清晰。",
      "answer": "4",
      "evaluate": "内容结构清晰,信息较为完整,仅有少量关键信息遗漏。",
      "result": "4"
    },
    {
      "question": "### 这款手机的评测\n- **屏幕**:6.7英寸AMOLED,显示效果出色。\n- **性能**:搭载最新处理器,运行速度快,流畅无卡顿。\n- **摄像头**:4800万像素主摄,支持夜景模式,成像清晰。\n- **价格**:起售价为899美元,性价比高。\n- **优点**:屏幕显示效果好,性能强劲。\n- **缺点**:电池容量较小,续航一般。\n\n总体来说,这是一款综合表现优秀的手机。",
      "answer": "5",
      "evaluate": "内容结构清晰,信息完整且详细,涵盖了所有关键方面。",
      "result": "5"
    }
  ]
}
@@ -0,0 +1,17 @@
你将会获得一个问答对,判断问答对是否满足以下标准:
标准:"{criterion}"

要求:
1. 结合以上标准,一步一步的分析question文本是否满足标准,这里的question不是指一个问题,只是输入的文本,按照模板输出每个维度的分数,你的result就是分数。额外输入一个维度平均分
2. 如果你对自己的判断没有较强的信心,直接算作不满足标准。
3. 你的最终裁定应该是1-5的评分,严格按照examples中打分的标准。
4. 如果你的回答不符合模板格式和规范,重新思考回答。
{examples}
问答对:
问题:"{question}"
答案:"{answer}"

模板:
结果:[1或2或3或4或5]
分析思路:XXX
"""
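The two files above fit together: the JSON supplies few-shot examples keyed by criterion, and the 17-line template leaves {criterion}, {examples}, {question} and {answer} slots to fill. A minimal sketch of that composition; the template is abbreviated and the surrounding names (QA_EVALUATE_TEMPLATE, examples_by_criterion) are invented for illustration, only the placeholder names come from the template itself:

import json

QA_EVALUATE_TEMPLATE = (
    "你将会获得一个问答对,判断问答对是否满足以下标准:\n"
    "标准:\"{criterion}\"\n"
    "{examples}\n"
    "问答对:\n"
    "问题:\"{question}\"\n"
    "答案:\"{answer}\"\n"
)  # abbreviated stand-in for the 17-line template above

# One trimmed entry in the shape of the examples JSON above
examples_by_criterion = {
    "对文本逻辑连贯性的评分,范围1-5分": [
        {"question": "今天天气很好,我吃了苹果。", "answer": "1",
         "evaluate": "这是一段完全没有逻辑的文字。", "result": "1"},
    ]
}

criterion = "对文本逻辑连贯性的评分,范围1-5分"
prompt = QA_EVALUATE_TEMPLATE.format(
    criterion=criterion,
    examples=json.dumps(examples_by_criterion[criterion], ensure_ascii=False, indent=2),
    question="今天天气很好,我吃了苹果。数学题很难。",
    answer="1")
print(prompt)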
52
runtime/ops/mapper/__init__.py
Normal file
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-

import sys
from pathlib import Path
from datamate.common.utils.custom_importer import CustomImporter


def _configure_importer():
    base_path = Path(__file__).resolve().parent
    sys.meta_path.append(CustomImporter(base_path))


_configure_importer()


def _import_operators():
    from . import content_cleaner
    from . import credit_card_number_cleaner
    from . import email_cleaner
    from . import emoji_cleaner
    from . import extra_space_cleaner
    from . import full_width_characters_cleaner
    from . import garble_characters_cleaner
    from . import html_tag_cleaner
    from . import id_number_cleaner
    from . import img_watermark_remove
    from . import invisible_characters_cleaner
    from . import ip_address_cleaner
    from . import legend_cleaner
    from . import phone_number_cleaner
    from . import political_word_cleaner
    from . import sexual_and_violent_word_cleaner
    from . import text_to_word
    from . import traditional_chinese
    from . import unicode_space_cleaner
    from . import url_cleaner
    from . import xml_tag_cleaner
    from . import img_enhanced_brightness
    from . import img_enhanced_contrast
    from . import img_enhanced_saturation
    from . import img_enhanced_sharpness
    from . import img_perspective_transformation
    from . import img_direction_correct
    from . import img_denoise
    from . import img_shadow_remove
    from . import img_type_unify
    from . import img_resize
    from . import remove_duplicate_sentences
    from . import knowledge_relation_slice


_import_operators()
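The package __init__ above wires a CustomImporter into sys.meta_path before importing the operator subpackages. For orientation only, the sketch below shows the hook such an importer plugs into; it logs lookups and defers to the normal import machinery, and it does not reflect what CustomImporter itself does (that class lives in datamate.common.utils.custom_importer and is not shown here).

import sys
from importlib.abc import MetaPathFinder


class LoggingFinder(MetaPathFinder):
    """Illustrative finder: observe import requests, then let the default finders handle them."""

    def find_spec(self, fullname, path, target=None):
        if fullname.startswith("ops.mapper"):
            print(f"import requested: {fullname}")
        return None  # defer to the remaining finders on sys.meta_path


sys.meta_path.append(LoggingFinder())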
6
runtime/ops/mapper/content_cleaner/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ContentCleaner',
                          module_path="ops.mapper.content_cleaner.process")
16
runtime/ops/mapper/content_cleaner/metadata.yml
Normal file
@@ -0,0 +1,16 @@
name: '文档目录去除'
name_en: 'Document Contents Removal'
description: '去除文档中的目录。'
description_en: 'Removes tables of contents from documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'ContentCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: ''
  after: ''
inputs: 'text'
outputs: 'text'
64
runtime/ops/mapper/content_cleaner/process.py
Normal file
@@ -0,0 +1,64 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: Document table-of-contents removal
Create: 2025/01/13
"""
import re
import time
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class ContentCleaner(Mapper):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.no_content_count = 3  # Threshold of consecutive lines that do not look like TOC entries
        # TOC heading
        self.content_text_pattern = r"^ *(目 *录|CONTENT(S)?) *$"
        # TOC entry: prefix format
        self.content_preface_pattern = r"^ *(前言|About This Document|\d+(\.\d+)*|[a-zA-Z]+(\.\d+)*)"
        # TOC entry: middle format
        self.content_middle_pattern = r"\.{7,}"
        # TOC entry: trailing format
        self.content_end_pattern = r"(\d|错误!未定义书签。|[IXV]+) *$"
        self.content_pattern = self.content_preface_pattern + ".*" + self.content_end_pattern

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._content_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: ContentCleaner costs {time.time() - start:6f} s")
        return sample

    def _content_filter(self, input_data: str):
        count = 0  # Number of consecutive non-TOC lines; after 3 such lines we assume the body text has started
        # Start and end indices of the TOC block
        content_start_index, content_end_index = -1, -1
        lines = input_data.split("\n")
        for i, line in enumerate(lines):
            if content_start_index >= 0 and count >= self.no_content_count:
                break
            # First look for the TOC heading ("目录" or "CONTENTS")
            if content_start_index < 0 and re.match(self.content_text_pattern, line, re.IGNORECASE):
                content_start_index = i
                content_end_index = i
            # Then match the two TOC entry shapes:
            # 1. a line with the expected prefix and trailing page number; 2. a line containing at least 7 dots
            elif content_start_index >= 0 and (re.match(self.content_pattern, line, re.IGNORECASE)
                                               or re.search(self.content_middle_pattern, line)):
                content_end_index = i
                count = 0
            elif content_start_index >= 0 and not (re.match(self.content_pattern, line, re.IGNORECASE)
                                                   or re.search(self.content_middle_pattern, line)):
                count += 1

        if 0 <= content_start_index < content_end_index:
            res = "\n".join(lines[:content_start_index] + lines[content_end_index + 1:])
        else:
            # Keep the heading when only the TOC keyword was found, or return the text unchanged
            res = "\n".join(lines)
        return res
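To see what those patterns treat as TOC lines, here is a small self-contained check that applies the same regular expressions outside the Mapper class to a four-line snippet; only the last line should be classified as body text:

import re

content_text_pattern = r"^ *(目 *录|CONTENT(S)?) *$"
content_line_pattern = r"^ *(前言|About This Document|\d+(\.\d+)*|[a-zA-Z]+(\.\d+)*).*(\d|错误!未定义书签。|[IXV]+) *$"

doc = "\n".join([
    "目录",
    "1 概述 .......... 1",
    "1.1 背景 ......... 2",
    "正文从这里开始。",
])
for line in doc.split("\n"):
    is_toc = bool(re.match(content_text_pattern, line, re.IGNORECASE)
                  or re.match(content_line_pattern, line, re.IGNORECASE)
                  or re.search(r"\.{7,}", line))
    print(f"{'TOC ' if is_toc else 'BODY'} | {line}")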
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='AnonymizedCreditCardNumber',
                          module_path="ops.mapper.credit_card_number_cleaner.process")
16
runtime/ops/mapper/credit_card_number_cleaner/metadata.yml
Normal file
@@ -0,0 +1,16 @@
name: '信用卡号匿名化'
name_en: 'Credit Card Number Anonymization'
description: '信用卡号匿名化'
description_en: 'Anonymizes credit card numbers.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedCreditCardNumber'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '这个是信用卡号:4111111111111111'
  after: '这个是信用卡号:<credit_card_number>'
inputs: 'text'
outputs: 'text'
83
runtime/ops/mapper/credit_card_number_cleaner/process.py
Normal file
@@ -0,0 +1,83 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: Credit card number anonymization
Create: 2024/12/5 15:43
"""
from loguru import logger
import re
import time
from typing import Dict, Any

from datamate.core.base_op import Mapper


class AnonymizedCreditCardNumber(Mapper):
    def __init__(self, *args, **kwargs):
        super(AnonymizedCreditCardNumber, self).__init__(*args, **kwargs)
        self.re_compile = self._get_credit_card_re_compile()

    @staticmethod
    def _verify_credit_card_num(credit_card_num: str):
        """Validate a candidate card number with the Luhn checksum"""
        # Walk the digits from right to left
        digits = [int(x) for x in reversed(credit_card_num) if x.isdigit()]
        # Double every second digit
        even_digits = [d * 2 for d in digits[1::2]]
        # If doubling produced a two-digit number, add its two digits together
        even_digits = [d // 10 + d % 10 for d in even_digits]
        # Sum the adjusted digits
        even_sum = sum(even_digits)
        # Sum the digits in the remaining (odd, right-to-left) positions
        odd_sum = sum(digits[::2])
        # The number is valid when the two sums add up to a multiple of 10
        if (odd_sum + even_sum) % 10 == 0:
            return True
        return False

    @staticmethod
    def _get_credit_card_re_compile():
        separator_symbol = r"([- ]?)"
        # American Express: 15 digits starting with 34 or 37, formatted NNNN-NNNNNN-NNNNN or NNNN NNNNNN NNNNN
        american_express = "3[47][0-9]{2}" + separator_symbol + "[0-9]{6}" + separator_symbol + "[0-9]{5}"
        # China UnionPay: 16 digits starting with 62 or 60, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
        china_union_pay = r"(6[02]\d{2})" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
        # Diners Club: 14 digits starting with 300-305, 36, 38, 39 or 3095, formatted NNNN-NNNNNN-NNNN or NNNN NNNNNN NNNN
        diners_club = r"(30[0-5]\d|3[689]\d{2}|3095)" + separator_symbol + r"[0-9]{6}" + separator_symbol + r"[0-9]{4}"
        # Discover: 16 digits starting with 6011, 644-649 or 65, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
        discover = r"(64[4-9]\d|65\d{2}|6011)" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
        # JCB: 16 digits starting with 3528-3589, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNNNNNN
        jcb = r"(352[89]|35[3-8]\d)" + separator_symbol + r"[0-9]{4}" + (
            r"(((%s\d{%d}){%d}" % (separator_symbol, 4, 2) + ")|" + separator_symbol + r"[0-9]{8})")
        # Mastercard: 16 digits starting with 51-55 or 2221-2720, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
        master_card = r"(5[1-5]\d{2}|222[1-9]|22[3-9]\d|2[3-6]\d{2}|27[01]\d|2720)" + r"(%s\d{%d}){%d}" \
                      % (separator_symbol, 4, 3)
        # Visa: 16 digits starting with 4, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
        visa = r"4\d{3}" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)

        credit_card_pattern = r"(?<=[^\d])(%s|%s|%s|%s|%s|%s|%s)(?=[^\d])" % (
            american_express, china_union_pay, diners_club,
            discover, jcb, master_card, visa)
        credit_card_re_compile = re.compile(credit_card_pattern)
        return credit_card_re_compile

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._credit_card_number_filter(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: CreditCardNumberCleaner costs {time.time() - start:6f} s")
        return sample

    def _credit_card_number_filter(self, input_data: str):
        """Find valid credit card numbers and anonymize them"""
        input_data = ''.join(['【', input_data, '】'])
        # Collect every substring that matches one of the card-number patterns
        credit_card_nums = [item.group(1) for item in self.re_compile.finditer(input_data)]
        # Keep only the candidates that pass the Luhn check
        for credit_card_num in credit_card_nums:
            if self._verify_credit_card_num(credit_card_num):
                # Replace each valid card number with <credit_card_number>
                credit_card_num_pattern = r"(?<=[^\d]){}(?=[^\d])".format(credit_card_num)
                input_data = re.compile(credit_card_num_pattern).sub("<credit_card_number>", input_data)
        return input_data[1:-1]
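The Luhn check above can be exercised on its own; the sketch below reuses the same arithmetic as _verify_credit_card_num against the test number shown in metadata.yml:

def luhn_ok(number: str) -> bool:
    """Same checksum as _verify_credit_card_num, as a standalone function."""
    digits = [int(x) for x in reversed(number) if x.isdigit()]
    doubled = [d * 2 for d in digits[1::2]]
    doubled = [d // 10 + d % 10 for d in doubled]
    return (sum(digits[::2]) + sum(doubled)) % 10 == 0


print(luhn_ok("4111111111111111"))  # True: the example number from metadata.yml passes
print(luhn_ok("4111111111111112"))  # False: the check digit is off by one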
6
runtime/ops/mapper/email_cleaner/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='EmailNumberCleaner',
                          module_path="ops.mapper.email_cleaner.process")
16
runtime/ops/mapper/email_cleaner/metadata.yml
Normal file
@@ -0,0 +1,16 @@
name: '邮件地址匿名化'
name_en: 'Email Address Anonymization'
description: '邮件地址匿名化'
description_en: 'Anonymizes email addresses.'
language: 'python'
vendor: 'huawei'
raw_id: 'EmailNumberCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '这个是邮箱号:test_email@gmail.com'
  after: '这个是邮箱号:<email>'
inputs: 'text'
outputs: 'text'
47
runtime/ops/mapper/email_cleaner/process.py
Normal file
@@ -0,0 +1,47 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: Email address anonymization
Create: 2025/01/15
"""
from loguru import logger
import re
import time
from typing import Dict, Any

from email_validator import validate_email, EmailNotValidError


from datamate.core.base_op import Mapper


class EmailNumberCleaner(Mapper):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.front_email_pattern = r'(?<=[^0-9a-zA-Z\!\#\$\%\&\'\*\+\-\/\=\?\^\_\`\{\|\}\~\-])'
        self.back_email_pattern = r'(?=[^0-9a-zA-Z\!\#\$\%\&\'\*\+\-\/\=\?\^\_\`\{\|\}\~\-])'
        self.email_pattern = r'([a-zA-Z\d.\-+_]+\s?@\s?[a-zA-Z\d.\-+_]+\.[a-zA-Z0-9]{2,6})'

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._email_number_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: EmailCleaner costs {time.time() - start:6f} s")
        return sample

    def _email_number_filter(self, input_data: str):
        """Anonymize email addresses"""
        mixed_data = ''.join(['龥', input_data, '龥'])
        paired_emails = re.compile(self.front_email_pattern + self.email_pattern + self.back_email_pattern).findall(
            mixed_data)
        if paired_emails:
            for email in paired_emails:
                try:
                    # Validate the candidate address
                    validate_email(email, check_deliverability=False)
                    mixed_data = re.compile(self.front_email_pattern + re.escape(email) + self.back_email_pattern).sub(
                        "<email>", mixed_data, count=1)
                except EmailNotValidError as err:
                    # Log that the address is invalid (without printing the address itself)
                    logger.error(f"invalid email address format: {err}")
        return mixed_data[1:-1]
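The same candidate-then-validate flow can be shown outside the Mapper: regex candidates are confirmed with email_validator, and only confirmed addresses are replaced. The sample strings below are made up; the second candidate is expected to fail validation because of the doubled dot in its local part, so it should be left untouched.

import re
from email_validator import validate_email, EmailNotValidError

EMAIL_PATTERN = r'([a-zA-Z\d.\-+_]+\s?@\s?[a-zA-Z\d.\-+_]+\.[a-zA-Z0-9]{2,6})'

text = "联系我: test_email@gmail.com 或 a..b@example.com"
for candidate in re.findall(EMAIL_PATTERN, text):
    try:
        # Confirm the candidate before anonymizing it
        validate_email(candidate, check_deliverability=False)
        text = text.replace(candidate, "<email>", 1)
    except EmailNotValidError:
        # Invalid candidates are left unchanged, mirroring the operator above
        pass
print(text)  # test_email@gmail.com becomes <email>; the doubled-dot candidate should remain as-is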
6
runtime/ops/mapper/emoji_cleaner/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='EmojiCleaner',
                          module_path="ops.mapper.emoji_cleaner.process")
Some files were not shown because too many files have changed in this diff.