You've already forked DataMate
init datamate
This commit is contained in:
6
runtime/ops/llms/text_quality_evaluation/__init__.py
Normal file
6
runtime/ops/llms/text_quality_evaluation/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
# Register this operator with the OPERATORS registry: the public operator name
# is paired with the dotted path of the module that implements it.
OPERATORS.register_module(
    module_name='TextQualityEvaluation',
    module_path="ops.llms.text_quality_evaluation.process",
)
|
||||
43
runtime/ops/llms/text_quality_evaluation/constant.py
Normal file
43
runtime/ops/llms/text_quality_evaluation/constant.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: Evaluation-dimension constants used by the text quality evaluation operator
|
||||
Create: 2023/11/20 16:20
|
||||
"""
|
||||
|
||||
# Generic quality dimensions. Every entry is a dict with three keys:
#   "dimension"   - display name of the dimension,
#   "description" - what the dimension measures,
#   "score_name"  - key under which the score for this dimension is stored.
EVAL_DIMENSION_MAP = [
    {"dimension": name, "description": description, "score_name": score_name}
    for name, description, score_name in (
        ("完备性", "数据的记录和信息是否是完整的,是否存在缺失的情况", "qua_score"),
        ("一致性", "同一指标在不同地方的结果是否一致", "logic_score"),
        ("有效性", "该样本涉及某领域的信息量", "effective_score"),
    )
]
|
||||
|
||||
# Business-domain dimensions, in the same three-key layout as the generic
# dimensions: "dimension" (name), "description" (what the domain covers) and
# "score_name" (key under which the score for this dimension is stored).
BUSINESS_EVAL_DIMENSION_MAP = [
    {"dimension": name, "description": description, "score_name": score_name}
    for name, description, score_name in (
        (
            "金融",
            (
                "涉及保险合同、保险问答、年报、资产负债表、金融新闻、保险从业资格CICE、基金从业资格、期货从业资格、注册会计师(CPA"
                ")、理财规划师、税务师、精算师-金融数学、经济师、证券从业资格、银行从业资格等相关金融行业知识"
            ),
            "finance_score",
        ),
        ("存储", "存储", "storage_score"),
        (
            "医疗",
            "涵盖中医科、儿科、内科、口腔科、外科、妇产科、心理科学、急诊科、感染与免疫科、生殖健康科、男性健康科、皮肤性病科、眼耳鼻喉科、神经科学、肿瘤科等医疗相关领域",
            "medical_score",
        ),
    )
]
|
||||
16
runtime/ops/llms/text_quality_evaluation/metadata.yml
Normal file
16
runtime/ops/llms/text_quality_evaluation/metadata.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
name: '文本质量评估'
|
||||
name_en: 'Text Quality Evaluation'
|
||||
description: '通过用户维度和相应描述进行文本评估。'
|
||||
description_en: 'Text evaluation is performed based on user dimensions and corresponding descriptions.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'TextQualityEvaluation'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'consolidate'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
113
runtime/ops/llms/text_quality_evaluation/process.py
Normal file
113
runtime/ops/llms/text_quality_evaluation/process.py
Normal file
@@ -0,0 +1,113 @@
|
||||
# -- encoding: utf-8 --
|
||||
|
||||
"""
|
||||
Description: 基于LLM通过用户设置维度和相应描述进行文本质量评估
|
||||
Create: 2025/3/14 11:00
|
||||
"""
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from functools import partial
|
||||
from typing import Dict, Any
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from datamate.common.utils.text_splitter import TextSplitter
|
||||
from datamate.core.base_op import LLM
|
||||
from .constant import EVAL_DIMENSION_MAP, BUSINESS_EVAL_DIMENSION_MAP
|
||||
from .prompt_config import TEXT_QUALITY_EVALUATE_TEMPLATE
|
||||
|
||||
# Chunk-size argument handed to TextSplitter when the operator is constructed.
# NOTE(review): presumably the maximum chunk length in characters - confirm against TextSplitter.
CHUNK_SIZE = 4000
|
||||
# Overlap argument handed to TextSplitter when the operator is constructed.
# NOTE(review): 0 presumably means consecutive chunks share no text - confirm against TextSplitter.
CHUNK_OVERLAP = 0
|
||||
|
||||
|
||||
class TextQualityEvaluation(LLM):
    """Score a text with an LLM on every configured evaluation dimension.

    The text is split into chunks, each chunk is scored concurrently on all
    dimensions of ``EVAL_DIMENSION_MAP + BUSINESS_EVAL_DIMENSION_MAP``, and the
    chunk scores are merged into one length-weighted average per dimension.
    """

    # Separators accepted between scores: the ASCII comma requested by the
    # prompt plus the full-width comma (U+FF0C) that models answering in
    # Chinese sometimes emit instead.
    _SCORE_SEPARATOR = re.compile(r'[,\uff0c]')
    # A fragment consisting of nothing but a bare "0" or "1". Used only as a
    # fallback when ``self.pattern`` finds no decimal in the fragment.
    _BARE_INTEGER_SCORE = re.compile(r'\s*([01])\s*')

    def __init__(self, *args, **kwargs):
        """Set up the accumulators, the text splitter and the LLM client.

        ``taskId`` is read from ``kwargs`` (default ``"default_id"``) and is
        only used to tag log messages.
        """
        super().__init__(*args, **kwargs)
        self.total_length = 0
        self.text_list = []
        # One accumulator slot per evaluation dimension.
        self.total_scores = [0] * len(self._all_dimensions())
        self.text_splitter = TextSplitter(1024 * 1024, CHUNK_SIZE, CHUNK_OVERLAP)
        # Regex for a decimal number such as "0.75".
        self.pattern = r'\d+\.\d+'
        self.task_id = kwargs.get("taskId", "default_id")

        self.llm = self.get_llm(*args, **kwargs)

    @staticmethod
    def _all_dimensions():
        """Return generic dimensions followed by business dimensions."""
        return EVAL_DIMENSION_MAP + BUSINESS_EVAL_DIMENSION_MAP

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Evaluate ``sample[self.text_key]`` and persist the scores.

        The text is split into chunks and scored; the resulting score dict is
        passed to ``save_sample`` and ``sample[self.text_key]`` is replaced by
        the literal ``"Success"``.

        Args:
            sample: Sample dict holding the text under ``self.text_key``.

        Returns:
            The same ``sample`` object, mutated as described above.
        """
        start = time.time()
        self.text_list = self.text_splitter.split_text(sample[self.text_key])
        logger.info(f"task id: {self.task_id}, the length of chunks: {len(self.text_list)}")
        text_res = {}
        try:
            self._evaluate_concurrently_text(text_res)

            sample[self.text_key] = "Success"
            self.save_sample([text_res], sample)
            cost_time = time.time() - start
            logger.info(f"task id: {self.task_id}, method: TextQualityEvaluation costs {cost_time:.6f} s")
        finally:
            # Always drop the chunk list so a failed run does not leave stale
            # chunks on the instance.
            self.text_list = []
        return sample

    def _evaluate_concurrently_text(self, text_res, max_workers: int = 5):
        """Score every chunk in ``self.text_list`` and fill ``text_res``.

        Args:
            text_res: Dict that receives one entry per dimension, keyed by the
                dimension's ``score_name``. The value is the length-weighted
                average score, or 0 when no chunk produced a usable result.
            max_workers: Size of the thread pool used for the LLM requests.
        """
        dimensions = self._all_dimensions()
        self.total_scores = [0] * len(dimensions)
        self.total_length = 0
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_params = {
                executor.submit(self.get_current_score_concurrently, text): text
                for text in self.text_list
            }
            # Results are folded in by this (the submitting) thread only, so
            # the accumulators need no locking.
            for future in as_completed(future_to_params):
                self.parse_execute_result(future, future_to_params)

        if self.text_list and self.total_length == 0:
            logger.warning(f"task id: {self.task_id}, no chunk produced a usable score, all scores default to 0")

        for eval_dimension, total_score in zip(dimensions, self.total_scores):
            if self.total_length > 0:
                text_res[eval_dimension["score_name"]] = total_score / self.total_length
            else:
                text_res[eval_dimension["score_name"]] = 0

    def parse_execute_result(self, future, future_to_params):
        """Fold the scores of one finished chunk into the running totals.

        A chunk contributes only when it returned exactly one score per
        dimension; each score is weighted by the chunk's length. Any exception
        raised by the worker is logged and the chunk is ignored.

        Args:
            future: A completed future returned by the executor.
            future_to_params: Mapping from each future to its chunk text.
        """
        text = future_to_params[future]
        try:
            scores = future.result()
            if not scores or len(scores) != len(self.total_scores):
                got = len(scores) if scores else 0
                logger.warning(
                    f"task id: {self.task_id}, chunk skipped: expected "
                    f"{len(self.total_scores)} scores, got {got}"
                )
                return
            weight = len(text)
            self.total_length += weight
            for index, score in enumerate(scores):
                self.total_scores[index] += score * weight
        except Exception as e:
            logger.error(f"Evaluate error, error details: {e}")

    def get_current_score_concurrently(self, text, retry: int = 2):
        """Build the prompt for one chunk and request its scores.

        Args:
            text: Chunk text inserted into the prompt as ``context``.
            retry: Number of additional attempts after a ``RuntimeError``.

        Returns:
            The list of parsed scores, or ``[]`` when every attempt raised
            ``RuntimeError``. Other exception types propagate to the caller.
        """
        # Placeholders are named dimension0, dimension1, ... in the template.
        dimension_kwargs = {
            f"dimension{index}": eval_dimension["dimension"] + ":" + eval_dimension["description"]
            for index, eval_dimension in enumerate(self._all_dimensions())
        }
        prompt = TEXT_QUALITY_EVALUATE_TEMPLATE.format(context=text, **dimension_kwargs)

        # A negative ``retry`` still performs the initial attempt.
        attempts = max(retry, 0) + 1
        for attempt in range(1, attempts + 1):
            try:
                return self.get_scores(prompt)
            except RuntimeError as e:
                if attempt == attempts:
                    logger.warning(f"Request LLM error, details: {e}")
        return []

    def get_scores(self, prompt):
        """Send ``prompt`` to the LLM and parse the scores from its reply.

        The reply is split on commas. In each fragment the last decimal number
        matching ``self.pattern`` is taken; if there is none, a fragment that
        is exactly ``0`` or ``1`` is accepted as well. Values outside
        ``[0, 1]`` and fragments without a number are skipped.

        Args:
            prompt: The fully formatted prompt.

        Returns:
            The parsed scores as floats, in reply order.
        """
        response = self.llm(prompt)
        if not response:
            logger.warning(f"task id: {self.task_id}, LLM returned an empty response")
            return []

        scores = []
        for fragment in self._SCORE_SEPARATOR.split(response):
            decimals = re.findall(self.pattern, fragment)
            if decimals:
                score = float(decimals[-1])
            else:
                bare = self._BARE_INTEGER_SCORE.fullmatch(fragment)
                if bare is None:
                    continue
                score = float(bare.group(1))
            if 0 <= score <= 1:
                scores.append(score)
        logger.info(f"current evaluate scores: {scores}")
        return scores
|
||||
32
runtime/ops/llms/text_quality_evaluation/prompt_config.py
Normal file
32
runtime/ops/llms/text_quality_evaluation/prompt_config.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: prompt 配置文件
|
||||
Create: 2024/02/07
|
||||
"""
|
||||
|
||||
# Prompt template for scoring one text chunk on six dimensions.
# Placeholders: {context} plus {dimension0} .. {dimension5}.
# The instructions name the criteria Dimension0~Dimension5 so that they match
# the labels used in the <Task> section, and ask for comma-separated scores.
TEXT_QUALITY_EVALUATE_TEMPLATE = """
===
<Role>:
你是一位擅长文本质量评估的数据处理专家。

===
<Instructions>:
你擅长根据已知的Context内容, 结合每个评估标准Dimension,给出该标准下文本质量评估结果,结果为0-1的小数:
- 充分理解Context内容,质量评估时要覆盖Context的主要内容,不能随意臆想和编造。
- 如果你对自己的判断没有较强的信心,直接算作不满足标准,输出0.0分。
- 总计会有六个评估标准,分别是Dimension0~Dimension5,每个评估标准都需要给出对应标准下的评估分数,分数为0-1的小数。
- 每个评估标准都只输出最终的打分,不能输出额外的内容;每个评估标准的评估结果之间用英文逗号“,”分开。
===
<Task>
请基于下面的参考信息和<Instructions>,生成符合要求的内容。
输入:
参考信息Context是: "{context}"
第一个评估标准Dimension0是: "{dimension0}"
第二个评估标准Dimension1是: "{dimension1}"
第三个评估标准Dimension2是: "{dimension2}"
第四个评估标准Dimension3是: "{dimension3}"
第五个评估标准Dimension4是: "{dimension4}"
第六个评估标准Dimension5是: "{dimension5}"
输出:
"""
|
||||
@@ -0,0 +1,98 @@
|
||||
{
|
||||
"对文本逻辑连贯性的评分,范围1-5分": [
|
||||
{
|
||||
"question": "今天天气很好,我吃了苹果。数学题很难,天空是蓝色的。狗会叫,鸟会飞。1234567890。",
|
||||
"answer": "1",
|
||||
"evaluate": "这是一段完全没有逻辑的文字,主题不断跳跃,没有任何结构可循。",
|
||||
"result": "1"
|
||||
},
|
||||
{
|
||||
"question": "我今天早上吃了面包,然后去了公园。天气很好,但突然下起了雨。我思考人生的意义,然后决定回家吃冰淇淋。",
|
||||
"answer": "2",
|
||||
"evaluate": "内容尚可理解,但逻辑连贯性较差,主题跳跃明显。",
|
||||
"result": "2"
|
||||
},
|
||||
{
|
||||
"question": "人工智能正在改变世界。它可以帮助我们解决复杂的问题,但也带来了伦理挑战。例如,自动驾驶汽车需要做出道德决策。此外,人工智能还可以用于医疗诊断。",
|
||||
"answer": "3",
|
||||
"evaluate": "内容结构尚可,逻辑基本连贯,但存在少量混乱或跳跃。",
|
||||
"result": "3"
|
||||
},
|
||||
{
|
||||
"question": "人工智能正在改变世界。它可以帮助我们解决复杂的问题,但也带来了伦理挑战。例如,自动驾驶汽车需要做出道德决策。此外,人工智能还可以用于医疗诊断。这些应用展示了其潜力和局限性。",
|
||||
"answer": "4",
|
||||
"evaluate": "内容结构清晰,逻辑连贯,仅有极小混乱或跳跃。",
|
||||
"result": "4"
|
||||
},
|
||||
{
|
||||
"question": "人工智能正在改变世界。它可以帮助我们解决复杂的问题,但也带来了伦理挑战。例如,自动驾驶汽车需要做出道德决策。此外,人工智能还可以用于医疗诊断。这些应用展示了其潜力和局限性,同时也引发了关于技术与人类关系的深入讨论。",
|
||||
"answer": "5",
|
||||
"evaluate": "内容结构清晰,逻辑严密,无任何混乱或跳跃。",
|
||||
"result": "5"
|
||||
}
|
||||
],
|
||||
"对文本格式一致性的评分,范围1-5分": [
|
||||
{
|
||||
"question": "巴黎的埃菲尔铁塔很高,伦敦的塔桥很老,纽约的自由女神像很美。东京的涩谷很有名,新加坡的滨海湾很繁华。",
|
||||
"answer": "1",
|
||||
"evaluate": "这是一段完全没有格式一致性的文字,段落之间没有任何分隔,内容完全混乱。",
|
||||
"result": "1"
|
||||
},
|
||||
{
|
||||
"question": "巴黎的埃菲尔铁塔很高,伦敦的塔桥很老,纽约的自由女神像很美。东京的涩谷很有名,新加坡的滨海湾很繁华。这些地方都很有特色,但描述方式不统一。",
|
||||
"answer": "2",
|
||||
"evaluate": "内容尚可理解,但格式一致性较差,段落之间没有任何分隔,存在较多格式混乱。",
|
||||
"result": "2"
|
||||
},
|
||||
{
|
||||
"question": "巴黎的埃菲尔铁塔很高。伦敦的塔桥很老。纽约的自由女神像很美。东京的涩谷很有名。新加坡的滨海湾很繁华。这些地方都有独特的建筑风格。",
|
||||
"answer": "3",
|
||||
"evaluate": "内容结构尚可,格式基本一致,但存在少量格式混乱或不一致。",
|
||||
"result": "3"
|
||||
},
|
||||
{
|
||||
"question": "巴黎的埃菲尔铁塔很高。\n伦敦的塔桥很老。\n纽约的自由女神像很美。\n东京的涩谷很有名。\n新加坡的滨海湾很繁华。\n这些地方都有独特的建筑风格。",
|
||||
"answer": "4",
|
||||
"evaluate": "内容结构清晰,格式一致,仅有极小格式混乱或不一致。",
|
||||
"result": "4"
|
||||
},
|
||||
{
|
||||
"question": "### 世界著名建筑\n- **巴黎的埃菲尔铁塔**:高耸入云,象征浪漫。\n- **伦敦的塔桥**:历史悠久,充满工业风格。\n- **纽约的自由女神像**:象征自由,举世闻名。\n- **东京的涩谷**:现代都市的代表,充满活力。\n- **新加坡的滨海湾**:融合自然与现代建筑,令人惊叹。\n\n这些地方都有独特的建筑风格,展现了不同的文化特色。",
|
||||
"answer": "5",
|
||||
"evaluate": "内容结构清晰,格式完全一致,无任何混乱或格式错误。",
|
||||
"result": "5"
|
||||
}
|
||||
],
|
||||
"对文本信息完整性的评分,范围1-5分": [
|
||||
{
|
||||
"question": "这款手机很好。",
|
||||
"answer": "1",
|
||||
"evaluate": "这是一段完全没有信息完整性的文字,内容过于简单,没有任何具体信息。",
|
||||
"result": "1"
|
||||
},
|
||||
{
|
||||
"question": "这款手机很好,屏幕很大。",
|
||||
"answer": "2",
|
||||
"evaluate": "内容尚可理解,但信息完整性较差,缺乏关键细节,如性能、价格等。",
|
||||
"result": "2"
|
||||
},
|
||||
{
|
||||
"question": "这款手机很好,屏幕很大,运行速度快。",
|
||||
"answer": "3",
|
||||
"evaluate": "内容结构尚可,信息基本完整,但存在关键信息遗漏,如摄像头质量、价格等。",
|
||||
"result": "3"
|
||||
},
|
||||
{
|
||||
"question": "这款手机很好,屏幕很大,运行速度快,摄像头也很清晰。",
|
||||
"answer": "4",
|
||||
"evaluate": "内容结构清晰,信息较为完整,仅有少量关键信息遗漏。",
|
||||
"result": "4"
|
||||
},
|
||||
{
|
||||
"question": "### 这款手机的评测\n- **屏幕**:6.7英寸AMOLED,显示效果出色。\n- **性能**:搭载最新处理器,运行速度快,流畅无卡顿。\n- **摄像头**:4800万像素主摄,支持夜景模式,成像清晰。\n- **价格**:起售价为899美元,性价比高。\n- **优点**:屏幕显示效果好,性能强劲。\n- **缺点**:电池容量较小,续航一般。\n\n总体来说,这是一款综合表现优秀的手机。",
|
||||
"answer": "5",
|
||||
"evaluate": "内容结构清晰,信息完整且详细,涵盖了所有关键方面。",
|
||||
"result": "5"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
你将会获得一个问答对,判断问答对是否满足以下标准:
|
||||
标准:"{criterion}"
|
||||
|
||||
要求:
|
||||
1. 结合以上标准,一步一步地分析question文本是否满足标准,这里的question不是指一个问题,只是输入的文本,按照模板输出每个维度的分数,你的result就是分数。额外输出一个维度平均分。
|
||||
2. 如果你对自己的判断没有较强的信心,直接算作不满足标准。
|
||||
3. 你的最终裁定应该是1-5的评分,严格按照examples中打分的标准。
|
||||
4. 如果你的回答不符合模板格式和规范,重新思考回答。
|
||||
{examples}
|
||||
问答对:
|
||||
问题:"{question}"
|
||||
答案:"{answer}"
|
||||
|
||||
模板:
|
||||
结果:[1或2或3或4或5]
|
||||
分析思路:XXX
|
||||
"""
|
||||
Reference in New Issue
Block a user