diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py
index e30fd06e8..c90b47de8 100644
--- a/browser_use/agent/service.py
+++ b/browser_use/agent/service.py
@@ -1730,6 +1730,15 @@ class Agent(Generic[Context, AgentStructuredOutput]):
 				# Modify the last action result (that should have is_done=True) to include the judgement
 				if self.history.history[-1].result[-1].is_done:
 					self.history.history[-1].result[-1].judgement = judgement
+				# Log the judgement verdict
+				if judgement:
+					verdict_color = '\033[32m' if judgement.verdict else '\033[31m'
+					verdict_text = 'āœ… PASS' if judgement.verdict else 'āŒ FAIL'
+					judge_log = f'\nāš–ļø {verdict_color}Judge Verdict: {verdict_text}\033[0m\n'
+					if judgement.failure_reason:
+						judge_log += f' Failure: {judgement.failure_reason}\n'
+					judge_log += f' {judgement.reasoning}\n'
+					self.logger.info(judge_log)
 
 				break
 			else:
diff --git a/examples/features/judge_trace.py b/examples/features/judge_trace.py
index b17489063..7d2033050 100644
--- a/examples/features/judge_trace.py
+++ b/examples/features/judge_trace.py
@@ -21,20 +21,15 @@ from browser_use.llm import ChatGoogle
 async def main():
 	llm = ChatBrowserUse()
-	# Use Claude for judging since it supports vision + structured output
+	# Use gemini flash for judging since it supports vision + structured output
 	judge_llm = ChatGoogle(model='gemini-flash-latest')
 
 	task = "Search Google for 'what is browser automation' and tell me the top 3 results"
-	agent = Agent(task=task, llm=llm, judge_llm=judge_llm)
+	agent = Agent(task=task, llm=llm, use_judge=True, judge_llm=judge_llm)
 	history = await agent.run()
 
 	# Print the judgement result
 	if history.is_judged():
-		judgement = history.judgement()
-		print('\n' + '=' * 80)
-		print('JUDGE EVALUATION')
-		print(judgement)
-	else:
-		print('\nNo judgement available (task may not have completed or use_judge=False)')
+		print(f'\nJudgement: {history.judgement()}')
 
 
 if __name__ == '__main__':