You've already forked DataMate
* feature: add cot data evaluation function * fix: added verification to evaluation results * fix: fix the prompt for evaluating * fix: 修复当评估结果为空导致读取失败的问题
114 lines
3.3 KiB
Python
114 lines
3.3 KiB
Python
import json
|
|
|
|
from enum import Enum
|
|
from jsonschema import validate
|
|
|
|
class ItemTypes(Enum):
    """Kinds of structured items a file handler can serve.

    Each member's value is the same string as its name.
    """

    # Question/answer records.
    QA = "QA"
    # Chain-of-thought records.
    COT = "COT"
|
|
|
|
|
|
class StructuredFileItemHandler:
    """Base reader that loads items from ``.json`` and ``.jsonl`` files.

    Subclasses override ``get_item_type`` and ``validate_json``. The base
    versions are placeholders that return ``None``, so the base class on
    its own rejects every payload and yields no items.
    """

    def __init__(self):
        pass

    def get_item_type(self) -> "ItemTypes":
        """Return the item type served by this handler (placeholder here)."""
        pass

    def validate_json(self, data):
        """Return a truthy value when ``data`` is acceptable (placeholder here)."""
        pass

    def get_items_from_file(self, file_path: str) -> list[dict]:
        """Load the validated items stored in ``file_path``.

        The format is chosen from the text after the last ``.`` in the
        path, compared case-insensitively:

        * ``JSON``  - the whole document is parsed and validated once. A
          list is returned as-is; a single object is wrapped in a
          one-element list so the result is always a list.
        * ``JSONL`` - every non-blank line is parsed and validated on its
          own; lines that fail validation are skipped.

        Any other extension yields an empty list.

        Args:
            file_path: Path of the file to read (opened as UTF-8 text).

        Returns:
            The items that passed ``validate_json``; empty when nothing
            passed or the extension is not recognised.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the content is not valid JSON.
        """
        file_type = file_path.split(".")[-1].upper()
        items = []
        if file_type == "JSON":
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            if not self.validate_json(data):
                return items
            # Validators may accept a lone object as well as an array;
            # normalise so callers always iterate over items, not keys.
            items = data if isinstance(data, list) else [data]
        elif file_type == "JSONL":
            with open(file_path, "r", encoding="utf-8") as f:
                for line in f:
                    # Blank lines (e.g. a trailing newline) carry no record
                    # and would make json.loads raise.
                    if not line.strip():
                        continue
                    data = json.loads(line)
                    if not self.validate_json(data):
                        continue
                    items.append(data)
        return items
|
|
|
|
class QAItemHandler(StructuredFileItemHandler):
    """Handler for QA items stored in the Alpaca layout.

    A payload is accepted when it is either one object whose
    ``instruction`` and ``output`` fields are strings (``input`` is an
    optional string), or an array of such objects.
    """

    def __init__(self):
        super().__init__()
        alpaca_item = {
            "type": "object",
            "properties": {
                "instruction": {"type": "string"},
                "input": {"type": "string"},
                "output": {"type": "string"},
            },
            "required": ["instruction", "output"],
        }
        self.schema_alpaca = alpaca_item
        # The list schema reuses the single-item schema for its elements.
        self.schema_alpaca_list = {"type": "array", "items": alpaca_item}

    def get_item_type(self):
        """Return ``ItemTypes.QA``."""
        return ItemTypes.QA

    def validate_json(self, data):
        """Return True if ``data`` is one Alpaca item or a list of them.

        Any error raised while validating counts as a mismatch.
        """
        # First attempt: a single item.
        try:
            validate(instance=data, schema=self.schema_alpaca)
        except Exception:
            pass
        else:
            return True

        # Fallback: an array of items.
        try:
            validate(instance=data, schema=self.schema_alpaca_list)
        except Exception:
            return False
        return True
|
|
|
|
|
|
class COTItemHandler(StructuredFileItemHandler):
    """Handler for chain-of-thought items.

    A payload is accepted when it is either one object whose ``question``,
    ``conclusion`` and ``chain_of_thought`` fields are all present and are
    strings, or an array of such objects.
    """

    def __init__(self):
        super().__init__()
        cot_item = {
            "type": "object",
            "properties": {
                "question": {"type": "string"},
                "conclusion": {"type": "string"},
                "chain_of_thought": {"type": "string"},
            },
            "required": ["question", "conclusion", "chain_of_thought"],
        }
        self.schema = cot_item
        # The list schema reuses the single-item schema for its elements.
        self.schema_list = {"type": "array", "items": cot_item}

    def get_item_type(self):
        """Return ``ItemTypes.COT``."""
        return ItemTypes.COT

    def validate_json(self, data):
        """Return True if ``data`` is one COT item or a list of them.

        Any error raised while validating counts as a mismatch.
        """
        # First attempt: a single item.
        try:
            validate(instance=data, schema=self.schema)
        except Exception:
            pass
        else:
            return True

        # Fallback: an array of items.
        try:
            validate(instance=data, schema=self.schema_list)
        except Exception:
            return False
        return True
|
|
|
|
|
|
class StructuredFileHandlerFactory:
    """Holds one instance of each item handler and looks them up by type name."""

    def __init__(self):
        self.handlers: list[StructuredFileItemHandler] = [
            QAItemHandler(),
            COTItemHandler(),
        ]

    def get_handler(self, item_type: str) -> StructuredFileItemHandler:
        """Return the first handler whose item-type value equals ``item_type``.

        Raises:
            ValueError: If no registered handler matches ``item_type``.
        """
        found = next(
            (
                candidate
                for candidate in self.handlers
                if candidate.get_item_type().value == item_type
            ),
            None,
        )
        if found is None:
            raise ValueError(f"Unsupported item type: {item_type}")
        return found
|