You've already forked DataMate
feat(data-management): 添加数据集相似度推荐功能
- 在DatasetApplicationService中实现getSimilarDatasets方法,支持基于标签匹配的相似数据集推荐
- 新增normalizeSimilarLimit、normalizeTagNames、countSharedTags等辅助方法用于相似度计算
- 在DatasetRepository接口及其实现类中添加findSimilarByTags方法,支持数据库层面的标签匹配查询
- 在DatasetController中暴露/similar REST API端点,支持按需获取相似数据集
- 在前端Overview组件中展示相似数据集表格,包含名称、标签、类型、文件数和更新时间等信息
- 在DatasetDetail页面集成相似数据集获取逻辑,限制默认返回数量为4条
- 移除KnowledgeItem中的冗余title字段,统一使用其他标识信息
- 优化知识管理相关组件中的标题显示逻辑,移除硬编码标题值
This commit is contained in:
@@ -35,6 +35,7 @@ import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
@@ -48,6 +49,10 @@ import java.util.stream.Stream;
|
||||
@RequiredArgsConstructor
|
||||
public class DatasetApplicationService {
|
||||
private static final String DATASET_PVC_NAME = "sys.management.dataset.pvc.name";
|
||||
private static final int SIMILAR_DATASET_DEFAULT_LIMIT = 4;
|
||||
private static final int SIMILAR_DATASET_MAX_LIMIT = 50;
|
||||
private static final int SIMILAR_DATASET_CANDIDATE_FACTOR = 5;
|
||||
private static final int SIMILAR_DATASET_CANDIDATE_MAX = 100;
|
||||
private final DatasetRepository datasetRepository;
|
||||
private final TagMapper tagMapper;
|
||||
private final DatasetFileRepository datasetFileRepository;
|
||||
@@ -153,6 +158,53 @@ public class DatasetApplicationService {
|
||||
return PagedResponse.of(datasetResponses, page.getCurrent(), page.getTotal(), page.getPages());
|
||||
}
|
||||
|
||||
@Transactional(readOnly = true)
|
||||
public List<DatasetResponse> getSimilarDatasets(String datasetId, Integer limit) {
|
||||
BusinessAssert.isTrue(StringUtils.hasText(datasetId), CommonErrorCode.PARAM_ERROR);
|
||||
Dataset dataset = datasetRepository.getById(datasetId);
|
||||
BusinessAssert.notNull(dataset, DataManagementErrorCode.DATASET_NOT_FOUND);
|
||||
Set<String> sourceTags = normalizeTagNames(dataset.getTags());
|
||||
if (sourceTags.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
int safeLimit = normalizeSimilarLimit(limit);
|
||||
int candidateLimit = Math.min(
|
||||
SIMILAR_DATASET_CANDIDATE_MAX,
|
||||
Math.max(safeLimit * SIMILAR_DATASET_CANDIDATE_FACTOR, safeLimit)
|
||||
);
|
||||
List<Dataset> candidates = datasetRepository.findSimilarByTags(
|
||||
new ArrayList<>(sourceTags),
|
||||
datasetId,
|
||||
candidateLimit
|
||||
);
|
||||
if (CollectionUtils.isEmpty(candidates)) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
Map<String, Integer> scoreMap = new HashMap<>();
|
||||
for (Dataset candidate : candidates) {
|
||||
int score = countSharedTags(sourceTags, candidate.getTags());
|
||||
if (score > 0 && candidate.getId() != null) {
|
||||
scoreMap.put(candidate.getId(), score);
|
||||
}
|
||||
}
|
||||
String datasetPvcName = getDatasetPvcName();
|
||||
List<Dataset> sorted = candidates.stream()
|
||||
.filter(candidate -> candidate.getId() != null && scoreMap.containsKey(candidate.getId()))
|
||||
.sorted((left, right) -> {
|
||||
int leftScore = scoreMap.getOrDefault(left.getId(), 0);
|
||||
int rightScore = scoreMap.getOrDefault(right.getId(), 0);
|
||||
if (leftScore != rightScore) {
|
||||
return Integer.compare(rightScore, leftScore);
|
||||
}
|
||||
return resolveUpdatedTime(right).compareTo(resolveUpdatedTime(left));
|
||||
})
|
||||
.limit(safeLimit)
|
||||
.toList();
|
||||
List<DatasetResponse> responses = DatasetConverter.INSTANCE.convertToResponse(sorted);
|
||||
responses.forEach(item -> item.setPvcName(datasetPvcName));
|
||||
return responses;
|
||||
}
|
||||
|
||||
/**
|
||||
* 处理标签名称,创建或获取标签
|
||||
*/
|
||||
@@ -174,6 +226,57 @@ public class DatasetApplicationService {
|
||||
return tags;
|
||||
}
|
||||
|
||||
private int normalizeSimilarLimit(Integer limit) {
|
||||
if (limit == null || limit <= 0) {
|
||||
return SIMILAR_DATASET_DEFAULT_LIMIT;
|
||||
}
|
||||
return Math.min(limit, SIMILAR_DATASET_MAX_LIMIT);
|
||||
}
|
||||
|
||||
private Set<String> normalizeTagNames(Collection<Tag> tags) {
|
||||
if (CollectionUtils.isEmpty(tags)) {
|
||||
return Collections.emptySet();
|
||||
}
|
||||
Set<String> normalized = new HashSet<>();
|
||||
for (Tag tag : tags) {
|
||||
if (tag == null || !StringUtils.hasText(tag.getName())) {
|
||||
continue;
|
||||
}
|
||||
normalized.add(tag.getName().trim());
|
||||
}
|
||||
return normalized;
|
||||
}
|
||||
|
||||
private int countSharedTags(Set<String> sourceTags, Collection<Tag> targetTags) {
|
||||
if (sourceTags.isEmpty()) {
|
||||
return 0;
|
||||
}
|
||||
Set<String> targetTagNames = normalizeTagNames(targetTags);
|
||||
if (targetTagNames.isEmpty()) {
|
||||
return 0;
|
||||
}
|
||||
int count = 0;
|
||||
for (String tagName : targetTagNames) {
|
||||
if (sourceTags.contains(tagName)) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
private LocalDateTime resolveUpdatedTime(Dataset dataset) {
|
||||
if (dataset == null) {
|
||||
return LocalDateTime.MIN;
|
||||
}
|
||||
if (dataset.getUpdatedAt() != null) {
|
||||
return dataset.getUpdatedAt();
|
||||
}
|
||||
if (dataset.getCreatedAt() != null) {
|
||||
return dataset.getCreatedAt();
|
||||
}
|
||||
return LocalDateTime.MIN;
|
||||
}
|
||||
|
||||
private Dataset resolveParentDataset(String parentDatasetId, String currentDatasetId) {
|
||||
String normalized = normalizeParentId(parentDatasetId);
|
||||
if (normalized == null) {
|
||||
|
||||
@@ -28,4 +28,6 @@ public interface DatasetRepository extends IRepository<Dataset> {
|
||||
IPage<Dataset> findByCriteria(IPage<Dataset> page, DatasetPagingQuery query);
|
||||
|
||||
long countByParentId(String parentDatasetId);
|
||||
|
||||
List<Dataset> findSimilarByTags(List<String> tagNames, String excludedDatasetId, int limit);
|
||||
}
|
||||
|
||||
@@ -13,6 +13,8 @@ import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.ibatis.session.RowBounds;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
@@ -88,4 +90,45 @@ public class DatasetRepositoryImpl extends CrudRepository<DatasetMapper, Dataset
|
||||
return datasetMapper.selectCount(new LambdaQueryWrapper<Dataset>()
|
||||
.eq(Dataset::getParentDatasetId, parentDatasetId));
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Dataset> findSimilarByTags(List<String> tagNames, String excludedDatasetId, int limit) {
|
||||
if (limit <= 0 || tagNames == null || tagNames.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
List<String> normalizedTags = new ArrayList<>();
|
||||
for (String tagName : tagNames) {
|
||||
if (StringUtils.isNotBlank(tagName)) {
|
||||
normalizedTags.add(tagName.trim());
|
||||
}
|
||||
}
|
||||
if (normalizedTags.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
LambdaQueryWrapper<Dataset> wrapper = new LambdaQueryWrapper<>();
|
||||
if (StringUtils.isNotBlank(excludedDatasetId)) {
|
||||
wrapper.ne(Dataset::getId, excludedDatasetId.trim());
|
||||
}
|
||||
wrapper.apply("tags IS NOT NULL AND JSON_VALID(tags) = 1 AND JSON_LENGTH(tags) > 0");
|
||||
wrapper.and(condition -> {
|
||||
boolean hasCondition = false;
|
||||
for (String tagName : normalizedTags) {
|
||||
if (!hasCondition) {
|
||||
condition.apply(
|
||||
"JSON_SEARCH(tags, 'one', {0}, NULL, '$[*].name') IS NOT NULL",
|
||||
tagName
|
||||
);
|
||||
hasCondition = true;
|
||||
continue;
|
||||
}
|
||||
condition.or().apply(
|
||||
"JSON_SEARCH(tags, 'one', {0}, NULL, '$[*].name') IS NOT NULL",
|
||||
tagName
|
||||
);
|
||||
}
|
||||
});
|
||||
wrapper.orderByDesc(Dataset::getUpdatedAt);
|
||||
wrapper.last("LIMIT " + limit);
|
||||
return datasetMapper.selectList(wrapper);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@ import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
@@ -113,4 +114,10 @@ public class DatasetController {
|
||||
public ResponseEntity<Response<AllDatasetStatisticsResponse>> getAllStatistics() {
|
||||
return ResponseEntity.ok(Response.ok(datasetApplicationService.getAllDatasetStatistics()));
|
||||
}
|
||||
|
||||
@GetMapping("/{datasetId}/similar")
|
||||
public List<DatasetResponse> getSimilarDatasets(@PathVariable("datasetId") String datasetId,
|
||||
@RequestParam(name = "limit", required = false) Integer limit) {
|
||||
return datasetApplicationService.getSimilarDatasets(datasetId, limit);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user