You've already forked DataMate
fix: 修复评估时模型输出json格式不对导致读取错误的问题 (#133)
* feature: add cot data evaluation function * fix: added verification to evaluation results * fix: fix the prompt for evaluating * fix: 修复当评估结果为空导致读取失败的问题
This commit is contained in:
@@ -13,3 +13,41 @@ def call_openai_style_model(base_url, api_key, model_name, prompt, **kwargs):
|
||||
**kwargs
|
||||
)
|
||||
return response.choices[0].message.content
|
||||
|
||||
def _extract_json_substring(raw: str) -> str:
|
||||
"""从 LLM 的原始回答中提取最可能的 JSON 字符串片段。
|
||||
|
||||
处理思路:
|
||||
- 原始回答可能是:说明文字 + JSON + 说明文字,甚至带有 Markdown 代码块。
|
||||
- 优先在文本中查找第一个 '{' 或 '[' 作为 JSON 起始;
|
||||
- 再从后向前找最后一个 '}' 或 ']' 作为结束;
|
||||
- 如果找不到合适的边界,就退回原始字符串。
|
||||
该方法不会保证截取的一定是合法 JSON,但能显著提高 json.loads 的成功率。
|
||||
"""
|
||||
if not raw:
|
||||
return raw
|
||||
|
||||
start = None
|
||||
end = None
|
||||
|
||||
# 查找第一个 JSON 起始符号
|
||||
for i, ch in enumerate(raw):
|
||||
if ch in "[{":
|
||||
start = i
|
||||
break
|
||||
|
||||
# 查找最后一个 JSON 结束符号
|
||||
for i in range(len(raw) - 1, -1, -1):
|
||||
if raw[i] in "]}":
|
||||
end = i + 1 # 切片是左闭右开
|
||||
break
|
||||
|
||||
if start is not None and end is not None and start < end:
|
||||
return raw[start:end].strip()
|
||||
|
||||
# 兜底:去掉常见 Markdown 包裹(```json ... ```)
|
||||
stripped = raw.strip()
|
||||
if stripped.startswith("```"):
|
||||
# 去掉首尾 ``` 标记
|
||||
stripped = stripped.strip("`")
|
||||
return stripped
|
||||
|
||||
@@ -5,6 +5,7 @@ from jsonschema import validate
|
||||
|
||||
class ItemTypes(Enum):
    """Kinds of structured items that a file item handler can declare."""

    # Each member's value is the same text as its name.
    QA = "QA"
    COT = "COT"
|
||||
|
||||
|
||||
class StructuredFileItemHandler:
|
||||
@@ -14,11 +15,26 @@ class StructuredFileItemHandler:
|
||||
def get_item_type(self) -> ItemTypes:
|
||||
pass
|
||||
|
||||
def get_items_from_file(self, file_path: str) -> list[dict]:
|
||||
def validate_json(self, data):
|
||||
pass
|
||||
|
||||
def check_file(self) -> bool:
|
||||
pass
|
||||
def get_items_from_file(self, file_path: str) -> list[dict]:
|
||||
file_type = file_path.split(".")[-1].upper()
|
||||
items = []
|
||||
if file_type == "JSON":
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
if not self.validate_json(data):
|
||||
return items
|
||||
items = data
|
||||
elif file_type == "JSONL":
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
if not self.validate_json(data):
|
||||
continue
|
||||
items.append(data)
|
||||
return items
|
||||
|
||||
class QAItemHandler(StructuredFileItemHandler):
|
||||
def __init__(self):
|
||||
@@ -51,32 +67,44 @@ class QAItemHandler(StructuredFileItemHandler):
|
||||
except Exception as e:
|
||||
return False
|
||||
|
||||
def get_items_from_file(self, file_path: str) -> list[dict]:
|
||||
file_type = file_path.split(".")[-1].upper()
|
||||
items = []
|
||||
if file_type == "JSON":
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
if not self.validate_json(data):
|
||||
return items
|
||||
items = data
|
||||
elif file_type == "JSONL":
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
if not self.validate_json(data):
|
||||
continue
|
||||
items.append(data)
|
||||
return items
|
||||
|
||||
def check_file(self) -> bool:
|
||||
pass
|
||||
class COTItemHandler(StructuredFileItemHandler):
    """Item handler for COT records.

    A record is an object with the string fields ``question``,
    ``conclusion`` and ``chain_of_thought``, all of which are required.
    """

    def __init__(self):
        # Schema for one record.
        record_schema = {
            "type": "object",
            "properties": {
                "question": {"type": "string"},
                "conclusion": {"type": "string"},
                "chain_of_thought": {"type": "string"}
            },
            "required": ["question", "conclusion", "chain_of_thought"],
        }
        self.schema = record_schema
        # Schema for an array in which every element is one record.
        self.schema_list = {
            "type": "array",
            "items": record_schema,
        }
        super().__init__()

    def get_item_type(self):
        """Return ``ItemTypes.COT``."""
        return ItemTypes.COT

    def validate_json(self, data):
        """Report whether ``data`` is one COT record or an array of them.

        ``data`` is checked against the single-record schema first and the
        array schema second. Any exception raised by a check counts as
        "does not match".

        Returns:
            True if either schema accepts ``data``, otherwise False.
        """
        for candidate in (self.schema, self.schema_list):
            try:
                validate(instance=data, schema=candidate)
            except Exception:
                # This schema did not accept the data; try the next one.
                continue
            return True
        return False
|
||||
|
||||
|
||||
class StructuredFileHandlerFactory:
|
||||
def __init__(self):
|
||||
self.handlers: list[StructuredFileItemHandler] = []
|
||||
self.handlers.append(QAItemHandler())
|
||||
self.handlers.append(COTItemHandler())
|
||||
|
||||
def get_handler(self, item_type: str) -> StructuredFileItemHandler:
|
||||
for handler in self.handlers:
|
||||
|
||||
Reference in New Issue
Block a user