feat(data-management): 添加数据集相似度推荐功能

- 在DatasetApplicationService中实现getSimilarDatasets方法,支持基于标签匹配的相似数据集推荐
- 新增normalizeSimilarLimit、normalizeTagNames、countSharedTags等辅助方法用于相似度计算
- 在DatasetRepository接口及其实现类中添加findSimilarByTags方法,支持数据库层面的标签匹配查询
- 在DatasetController中暴露/similar REST API端点,支持按需获取相似数据集
- 在前端Overview组件中展示相似数据集表格,包含名称、标签、类型、文件数和更新时间等信息
- 在DatasetDetail页面集成相似数据集获取逻辑,限制默认返回数量为4条
- 移除KnowledgeItem中的冗余title字段,统一使用其他标识信息
- 优化知识管理相关组件中的标题显示逻辑,移除硬编码标题值
This commit is contained in:
2026-01-30 11:43:44 +08:00
parent c51cd2b6e4
commit c221666e67
12 changed files with 481 additions and 98 deletions

View File

@@ -35,6 +35,7 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDateTime;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -48,6 +49,10 @@ import java.util.stream.Stream;
@RequiredArgsConstructor
public class DatasetApplicationService {
private static final String DATASET_PVC_NAME = "sys.management.dataset.pvc.name";
// Result size used by normalizeSimilarLimit when the caller passes null or a non-positive limit.
private static final int SIMILAR_DATASET_DEFAULT_LIMIT = 4;
// Upper bound that normalizeSimilarLimit applies to any caller-supplied limit.
private static final int SIMILAR_DATASET_MAX_LIMIT = 50;
// getSimilarDatasets asks the repository for (limit * factor) candidates before re-ranking them
// by shared-tag count; presumably this is to give the re-ranking a larger pool to choose from.
private static final int SIMILAR_DATASET_CANDIDATE_FACTOR = 5;
// Hard cap on the number of candidates requested from the repository, whatever limit * factor yields.
private static final int SIMILAR_DATASET_CANDIDATE_MAX = 100;
private final DatasetRepository datasetRepository;
private final TagMapper tagMapper;
private final DatasetFileRepository datasetFileRepository;
@@ -153,6 +158,53 @@ public class DatasetApplicationService {
return PagedResponse.of(datasetResponses, page.getCurrent(), page.getTotal(), page.getPages());
}
/**
 * Recommends datasets that share at least one tag with the given dataset.
 *
 * <p>The source dataset's tag names are normalized, a bounded pool of candidates is fetched
 * from the repository, and the pool is re-ranked in memory: more shared tags first, and for
 * equal scores the more recently updated dataset first (creation time is used when the
 * update time is missing). The ranked list is cut to the normalized limit and every
 * response is stamped with the dataset PVC name.
 *
 * @param datasetId id of the dataset to find neighbours for; checked via
 *                  {@code BusinessAssert} against {@code CommonErrorCode.PARAM_ERROR}
 *                  when blank, and against {@code DataManagementErrorCode.DATASET_NOT_FOUND}
 *                  when no dataset is found
 * @param limit     requested number of results; {@code null} or non-positive values fall
 *                  back to the default, larger values are capped
 * @return ranked similar datasets; empty when the source dataset has no usable tags or the
 *         repository returns no candidates
 */
@Transactional(readOnly = true)
public List<DatasetResponse> getSimilarDatasets(String datasetId, Integer limit) {
    BusinessAssert.isTrue(StringUtils.hasText(datasetId), CommonErrorCode.PARAM_ERROR);
    Dataset source = datasetRepository.getById(datasetId);
    BusinessAssert.notNull(source, DataManagementErrorCode.DATASET_NOT_FOUND);

    Set<String> referenceTags = normalizeTagNames(source.getTags());
    if (referenceTags.isEmpty()) {
        return Collections.emptyList();
    }

    int resultSize = normalizeSimilarLimit(limit);
    // Over-fetch relative to the requested size, but never beyond the global candidate cap.
    int widenedSize = Math.max(resultSize * SIMILAR_DATASET_CANDIDATE_FACTOR, resultSize);
    int poolSize = Math.min(SIMILAR_DATASET_CANDIDATE_MAX, widenedSize);

    List<Dataset> pool = datasetRepository.findSimilarByTags(
            new ArrayList<>(referenceTags), datasetId, poolSize);
    if (CollectionUtils.isEmpty(pool)) {
        return Collections.emptyList();
    }

    // Score = number of tag names the candidate has in common with the source dataset.
    Map<String, Integer> sharedTagCounts = new HashMap<>();
    for (Dataset entry : pool) {
        int shared = countSharedTags(referenceTags, entry.getTags());
        if (shared <= 0 || entry.getId() == null) {
            continue;
        }
        sharedTagCounts.put(entry.getId(), shared);
    }

    String datasetPvcName = getDatasetPvcName();

    Comparator<Dataset> byScoreDescending = (first, second) -> Integer.compare(
            sharedTagCounts.getOrDefault(second.getId(), 0),
            sharedTagCounts.getOrDefault(first.getId(), 0));
    Comparator<Dataset> byRecencyDescending = (first, second) ->
            resolveUpdatedTime(second).compareTo(resolveUpdatedTime(first));

    List<Dataset> ranked = pool.stream()
            .filter(entry -> entry.getId() != null)
            .filter(entry -> sharedTagCounts.containsKey(entry.getId()))
            .sorted(byScoreDescending.thenComparing(byRecencyDescending))
            .limit(resultSize)
            .toList();

    List<DatasetResponse> responses = DatasetConverter.INSTANCE.convertToResponse(ranked);
    for (DatasetResponse response : responses) {
        response.setPvcName(datasetPvcName);
    }
    return responses;
}
/**
* 处理标签名称,创建或获取标签
*/
@@ -174,6 +226,57 @@ public class DatasetApplicationService {
return tags;
}
/**
 * Turns a caller-supplied limit into a usable result size.
 *
 * @param limit requested size, may be {@code null}
 * @return the default size when {@code limit} is {@code null} or not positive,
 *         otherwise {@code limit} capped at the configured maximum
 */
private int normalizeSimilarLimit(Integer limit) {
    boolean usable = limit != null && limit > 0;
    return usable
            ? Math.min(limit, SIMILAR_DATASET_MAX_LIMIT)
            : SIMILAR_DATASET_DEFAULT_LIMIT;
}
/**
 * Extracts the distinct, trimmed names of the given tags.
 *
 * <p>{@code null} tags and tags whose name has no text are ignored.
 *
 * @param tags tags to read names from, may be {@code null} or empty
 * @return the trimmed tag names; an empty set when {@code tags} is {@code null} or empty
 */
private Set<String> normalizeTagNames(Collection<Tag> tags) {
    if (CollectionUtils.isEmpty(tags)) {
        return Collections.emptySet();
    }
    return tags.stream()
            .filter(Objects::nonNull)
            .map(Tag::getName)
            .filter(name -> StringUtils.hasText(name))
            .map(String::trim)
            .collect(Collectors.toCollection(HashSet::new));
}
/**
 * Counts how many distinct normalized tag names of {@code targetTags} also occur in
 * {@code sourceTags}. The comparison is an exact, case-sensitive string match.
 *
 * @param sourceTags already-normalized tag names of the reference dataset
 * @param targetTags tags of the dataset being scored, may be {@code null} or empty
 * @return number of shared tag names; {@code 0} when either side has no usable names
 */
private int countSharedTags(Set<String> sourceTags, Collection<Tag> targetTags) {
    if (sourceTags.isEmpty()) {
        return 0;
    }
    Set<String> candidateNames = normalizeTagNames(targetTags);
    // An empty candidate set simply yields a count of zero.
    return (int) candidateNames.stream()
            .filter(sourceTags::contains)
            .count();
}
/**
 * Picks the timestamp used to order datasets by recency.
 *
 * @param dataset dataset to inspect, may be {@code null}
 * @return the update time if present, otherwise the creation time, otherwise
 *         {@link LocalDateTime#MIN} (also returned for a {@code null} dataset)
 */
private LocalDateTime resolveUpdatedTime(Dataset dataset) {
    if (dataset == null) {
        return LocalDateTime.MIN;
    }
    LocalDateTime updatedAt = dataset.getUpdatedAt();
    if (updatedAt != null) {
        return updatedAt;
    }
    LocalDateTime createdAt = dataset.getCreatedAt();
    return createdAt != null ? createdAt : LocalDateTime.MIN;
}
private Dataset resolveParentDataset(String parentDatasetId, String currentDatasetId) {
String normalized = normalizeParentId(parentDatasetId);
if (normalized == null) {

View File

@@ -28,4 +28,6 @@ public interface DatasetRepository extends IRepository<Dataset> {
IPage<Dataset> findByCriteria(IPage<Dataset> page, DatasetPagingQuery query);
long countByParentId(String parentDatasetId);
/**
 * Looks up datasets that carry at least one of the given tag names.
 *
 * <p>NOTE(review): the details below describe the implementation added alongside this
 * declaration (DatasetRepositoryImpl); confirm they are meant to be part of the contract.
 * That implementation ignores blank tag names, skips the dataset identified by
 * {@code excludedDatasetId} when it is non-blank, orders results by update time
 * descending, and returns an empty list when {@code limit} is not positive or no usable
 * tag name is supplied.
 *
 * @param tagNames          tag names to match
 * @param excludedDatasetId id of a dataset to leave out of the result, may be blank
 * @param limit             maximum number of datasets to return
 * @return matching datasets, at most {@code limit} entries
 */
List<Dataset> findSimilarByTags(List<String> tagNames, String excludedDatasetId, int limit);
}

View File

@@ -13,6 +13,8 @@ import org.apache.commons.lang3.StringUtils;
import org.apache.ibatis.session.RowBounds;
import org.springframework.stereotype.Repository;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
@@ -88,4 +90,45 @@ public class DatasetRepositoryImpl extends CrudRepository<DatasetMapper, Dataset
return datasetMapper.selectCount(new LambdaQueryWrapper<Dataset>()
.eq(Dataset::getParentDatasetId, parentDatasetId));
}
/**
 * Finds datasets whose JSON {@code tags} column contains at least one of the given tag names.
 *
 * <p>Blank tag names are ignored and the remaining ones are trimmed. Rows whose
 * {@code tags} value is NULL, not valid JSON, or an empty JSON document are skipped.
 * Results are ordered by update time, newest first, and truncated with a SQL {@code LIMIT}.
 *
 * <p>{@code JSON_SEARCH} interprets its search string as a LIKE pattern, so {@code %} and
 * {@code _} inside a tag name would act as wildcards and match unrelated tags. Those false
 * positives would use up the limited result window. The tag names are therefore escaped
 * before being bound, which makes the match exact.
 *
 * @param tagNames          tag names to match, may be {@code null} or empty
 * @param excludedDatasetId id of a dataset to leave out of the result, may be blank
 * @param limit             maximum number of rows to return
 * @return matching datasets; an empty list when {@code limit} is not positive or no usable
 *         tag name was supplied
 */
@Override
public List<Dataset> findSimilarByTags(List<String> tagNames, String excludedDatasetId, int limit) {
    if (limit <= 0 || tagNames == null || tagNames.isEmpty()) {
        return Collections.emptyList();
    }
    List<String> searchPatterns = new ArrayList<>();
    for (String tagName : tagNames) {
        if (StringUtils.isNotBlank(tagName)) {
            searchPatterns.add(escapeJsonSearchPattern(tagName.trim()));
        }
    }
    if (searchPatterns.isEmpty()) {
        return Collections.emptyList();
    }
    LambdaQueryWrapper<Dataset> wrapper = new LambdaQueryWrapper<>();
    if (StringUtils.isNotBlank(excludedDatasetId)) {
        wrapper.ne(Dataset::getId, excludedDatasetId.trim());
    }
    wrapper.apply("tags IS NOT NULL AND JSON_VALID(tags) = 1 AND JSON_LENGTH(tags) > 0");
    // One nested group: (match tag1 OR match tag2 OR ...). Values are bound as parameters.
    wrapper.and(condition -> {
        boolean first = true;
        for (String pattern : searchPatterns) {
            (first ? condition : condition.or()).apply(
                    "JSON_SEARCH(tags, 'one', {0}, NULL, '$[*].name') IS NOT NULL",
                    pattern
            );
            first = false;
        }
    });
    wrapper.orderByDesc(Dataset::getUpdatedAt);
    // limit is a validated positive int, so appending it cannot inject SQL.
    wrapper.last("LIMIT " + limit);
    return datasetMapper.selectList(wrapper);
}

/**
 * Escapes LIKE metacharacters so the value is matched literally by JSON_SEARCH.
 * Backslash is the escape character JSON_SEARCH uses when its escape argument is NULL,
 * so it is escaped first, followed by the two wildcards.
 */
private static String escapeJsonSearchPattern(String value) {
    return value
            .replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_");
}
}

View File

@@ -15,6 +15,7 @@ import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import java.util.List;
import java.util.Map;
/**
@@ -113,4 +114,10 @@ public class DatasetController {
public ResponseEntity<Response<AllDatasetStatisticsResponse>> getAllStatistics() {
return ResponseEntity.ok(Response.ok(datasetApplicationService.getAllDatasetStatistics()));
}
/**
 * Returns datasets similar to the one identified by {@code datasetId}.
 *
 * <p>NOTE(review): the neighbouring statistics endpoint returns
 * {@code ResponseEntity<Response<...>>} while this one returns the bare list. Confirm
 * whether the difference is intentional.
 *
 * @param datasetId id of the reference dataset, taken from the path
 * @param limit     optional maximum number of results; passed through to the service as-is
 * @return the similar datasets produced by the application service
 */
@GetMapping("/{datasetId}/similar")
public List<DatasetResponse> getSimilarDatasets(
        @PathVariable("datasetId") String datasetId,
        @RequestParam(name = "limit", required = false) Integer limit) {
    List<DatasetResponse> similarDatasets =
            datasetApplicationService.getSimilarDatasets(datasetId, limit);
    return similarDatasets;
}
}