From e4b03e712ea005bbeaf4b725578cd957df24e7aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?= <67061560+MagMueller@users.noreply.github.com>
Date: Mon, 23 Jun 2025 01:23:29 +0200
Subject: [PATCH] Enhances comprehensive evaluation data handling in TaskResult

Updates the TaskResult class to improve the processing of comprehensive
evaluation data. The changes include:

- Introduces fallback to retrieve 'comprehensive_judge' if
  'comprehensive_evaluation' is not present.
- Converts enum lists for 'task_categories' and 'error_categories' to
  string lists for better database compatibility.
- Updates payload to include the processed categories and errors,
  ensuring proper data structure for evaluation results.
---
 eval/service.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/eval/service.py b/eval/service.py
index 2bb670123..4c7b20e49 100644
--- a/eval/service.py
+++ b/eval/service.py
@@ -596,16 +596,25 @@ class TaskResult:
 		eval_data = self.stage_data.get(Stage.EVALUATE, {})
 
 		# Handle comprehensive judge evaluation
-		comp_eval = eval_data.get('comprehensive_evaluation')
+		comp_eval = eval_data.get('comprehensive_evaluation') or eval_data.get('comprehensive_judge')
 		if comp_eval:
+			# Convert enum lists to string lists for database storage
+			task_categories = comp_eval.get('task_categories', [])
+			if task_categories and hasattr(task_categories[0], 'value'):
+				task_categories = [cat.value for cat in task_categories]
+
+			error_categories = comp_eval.get('error_categories', [])
+			if error_categories and hasattr(error_categories[0], 'value'):
+				error_categories = [err.value for err in error_categories]
+
 			payload.update(
 				{
 					'comprehensiveJudgeEvaluationSummary': comp_eval.get('task_summary'),
 					'comprehensiveJudgeEvaluationReasoning': comp_eval.get('reasoning'),
 					'comprehensiveJudgeEvaluationPassed': comp_eval.get('passed'),
 					'comprehensiveJudgeEvaluationScore': comp_eval.get('final_score'),
-					'comprehensiveJudgeEvaluationCategories': comp_eval.get('task_categories', []),
-					'comprehensiveJudgeEvaluationErrors': comp_eval.get('error_categories', []),
+					'comprehensiveJudgeEvaluationCategories': task_categories,
+					'comprehensiveJudgeEvaluationErrors': error_categories,
 					'comprehensiveJudgeEvaluationTips': comp_eval.get('improvement_tips', []),
 					'comprehensiveJudgeEvaluationScores': comp_eval.get('scores'),
 					'comprehensiveJudgeEvaluationFull': comp_eval,  # Include full comprehensive eval data