import json from enum import Enum from jsonschema import validate class ItemTypes(Enum): QA = "QA" COT = "COT" class StructuredFileItemHandler: def __init__(self): pass def get_item_type(self) -> ItemTypes: pass def validate_json(self, data): pass def get_items_from_file(self, file_path: str) -> list[dict]: file_type = file_path.split(".")[-1].upper() items = [] if file_type == "JSON": with open(file_path, "r", encoding="utf-8") as f: data = json.load(f) if not self.validate_json(data): return items items = data elif file_type == "JSONL": with open(file_path, "r", encoding="utf-8") as f: for line in f: data = json.loads(line) if not self.validate_json(data): continue items.append(data) return items class QAItemHandler(StructuredFileItemHandler): def __init__(self): self.schema_alpaca = { "type": "object", "properties": { "instruction": {"type": "string"}, "input": {"type": "string"}, "output": {"type": "string"} }, "required": ["instruction", "output"], } self.schema_alpaca_list = { "type": "array", "items": self.schema_alpaca, } super().__init__() def get_item_type(self): return ItemTypes.QA def validate_json(self, data): try: validate(instance=data, schema=self.schema_alpaca) return True except Exception as e: try: validate(instance=data, schema=self.schema_alpaca_list) return True except Exception as e: return False class COTItemHandler(StructuredFileItemHandler): def __init__(self): self.schema = { "type": "object", "properties": { "question": {"type": "string"}, "conclusion": {"type": "string"}, "chain_of_thought": {"type": "string"} }, "required": ["question", "conclusion", "chain_of_thought"], } self.schema_list = { "type": "array", "items": self.schema, } super().__init__() def get_item_type(self): return ItemTypes.COT def validate_json(self, data): try: validate(instance=data, schema=self.schema) return True except Exception as e: try: validate(instance=data, schema=self.schema_list) return True except Exception as e: return False class StructuredFileHandlerFactory: def __init__(self): self.handlers: list[StructuredFileItemHandler] = [] self.handlers.append(QAItemHandler()) self.handlers.append(COTItemHandler()) def get_handler(self, item_type: str) -> StructuredFileItemHandler: for handler in self.handlers: if handler.get_item_type().value == item_type: return handler raise ValueError(f"Unsupported item type: {item_type}")