diff --git a/ais_bench/benchmark/configs/datasets/json_mode_eval/json_mode_eval_gen.py b/ais_bench/benchmark/configs/datasets/json_mode_eval/json_mode_eval_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0483bb95c6a58e9132d76dc744c1a1b0b179ea5
--- /dev/null
+++ b/ais_bench/benchmark/configs/datasets/json_mode_eval/json_mode_eval_gen.py
@@ -0,0 +1,37 @@
+from ais_bench.benchmark.openicl.icl_prompt_template import PromptTemplate
+from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
+from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
+from ais_bench.benchmark.datasets import JsonModeEvalDataset, JsonModeEvalEvaluator
+
+json_mode_eval_reader_cfg = dict(
+    input_columns=['prompt'],
+    output_column='completion',
+    train_split='train',
+    test_split='test'
+)
+
+JSON_MODE_EVAL_TEMPLATE = ''
+
+json_mode_eval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=JSON_MODE_EVAL_TEMPLATE
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+json_mode_eval_eval_cfg = dict(
+    evaluator=dict(type=JsonModeEvalEvaluator),
+)
+
+json_mode_eval_datasets = [
+    dict(
+        abbr='json_mode_eval',
+        type=JsonModeEvalDataset,
+        path='/data/dataset/json-mode-eval',  # dataset path
+        reader_cfg=json_mode_eval_reader_cfg,
+        infer_cfg=json_mode_eval_infer_cfg,
+        eval_cfg=json_mode_eval_eval_cfg,
+    )
+]
diff --git a/ais_bench/benchmark/datasets/json_mode_eval.py b/ais_bench/benchmark/datasets/json_mode_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0608990f2bf074d5d38fbd5d3632bc5755178b0
--- /dev/null
+++ b/ais_bench/benchmark/datasets/json_mode_eval.py
@@ -0,0 +1,64 @@
+import json
+import os
+from os import environ
+import pandas as pd
+from datasets import Dataset, DatasetDict
+from ais_bench.benchmark.openicl import BaseEvaluator
+from ais_bench.benchmark.registry import LOAD_DATASET
+
+@LOAD_DATASET.register_module()
+class JsonModeEvalDataset:
+    @staticmethod
+    def load(path):
+        path = os.path.abspath(path)
+
+        datasets = {}
+        for split in ['train', 'test']:
+            file_path = os.path.join(path, f'{split}.parquet')
+
+            if not os.path.exists(file_path):
+                continue
+
+            df = pd.read_parquet(file_path)
+
+            required_columns = {'prompt', 'completion', 'schema'}
+            if not required_columns.issubset(df.columns):
+                missing = required_columns - set(df.columns)
+                raise ValueError(f"Parquet file is missing required columns: {missing}")
+
+            dataset = Dataset.from_pandas(df)
+            datasets[split] = dataset
+
+        return DatasetDict(datasets)
+
+class JsonModeEvalEvaluator(BaseEvaluator):
+    def is_json(self, s):
+        try:
+            json.loads(s)
+            return True
+        except (json.JSONDecodeError, TypeError):
+            return False
+
+    def score(self, predictions, references):
+        correct = 0
+        count = len(predictions)
+        details = []
+
+        for pred, ref in zip(predictions, references):
+            pred_valid = self.is_json(pred)
+            ref_valid = self.is_json(ref)
+            both_valid = pred_valid and ref_valid
+
+            if both_valid:
+                correct += 1
+
+            details.append({
+                'pred': pred,
+                'answer': ref,
+                'pred_valid': pred_valid,
+                'ref_valid': ref_valid,
+                'correct': both_valid
+            })
+
+        accuracy = 100 * correct / count if count > 0 else 0
+        return {'accuracy': accuracy, 'details': details}
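A quick sanity check of the scoring behavior added above. This snippet is illustrative only and not part of the patch; it assumes the ais_bench package is installed so the import resolves, and that BaseEvaluator can be instantiated without arguments, as OpenCompass-style evaluators typically can.

# Illustrative sketch: exercises JsonModeEvalEvaluator.score() directly.
# Assumes ais_bench is importable and BaseEvaluator() needs no arguments.
from ais_bench.benchmark.datasets.json_mode_eval import JsonModeEvalEvaluator

evaluator = JsonModeEvalEvaluator()
result = evaluator.score(
    predictions=['{"name": "Alice"}', 'not json'],
    references=['{"name": "Alice"}', '{"name": "Bob"}'],
)
print(result['accuracy'])  # 50.0: only the first prediction parses as valid JSON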