From e91cd2da9d5805d40554356ddf98d1bfa44d33cc Mon Sep 17 00:00:00 2001 From: magmueller Date: Mon, 4 Nov 2024 19:22:24 +0100 Subject: [PATCH] Included memory, improved prompt, and done function --- examples/flight_search_example.ipynb | 1 + examples/web_navigation_example.ipynb | 41 +++++++++------------------ src/agent/prompts.py | 38 ++++++++++++++----------- src/agent/service.py | 2 +- src/agent/views.py | 6 ++-- src/browser/service.py | 7 +++++ src/controller/service.py | 3 +- src/controller/views.py | 8 ++++-- src/tests/test_kayak_search.py | 1 + src/tests/test_mind2web.py | 2 ++ 10 files changed, 60 insertions(+), 49 deletions(-) diff --git a/examples/flight_search_example.ipynb b/examples/flight_search_example.ipynb index 887b228b1..b0d6c78d4 100644 --- a/examples/flight_search_example.ipynb +++ b/examples/flight_search_example.ipynb @@ -69,6 +69,7 @@ "\n", "\tif result.done:\n", "\t\tprint('\\n✅ Task completed successfully!')\n", + "\t\tprint('Extracted content:', result.extracted_content)\n", "\t\tbreak\n" ] } diff --git a/examples/web_navigation_example.ipynb b/examples/web_navigation_example.ipynb index ec6bbc6c2..aa18f98d3 100644 --- a/examples/web_navigation_example.ipynb +++ b/examples/web_navigation_example.ipynb @@ -15,17 +15,12 @@ "metadata": {}, "outputs": [], "source": [ - "import sys\n", + "import datetime\n", "import os\n", - "from pathlib import Path\n", - "\n", - "notebook_dir = Path(os.getcwd())\n", - "project_root = str(notebook_dir.parent)\n", - "sys.path.append(project_root)\n", - "\n", "from langchain_openai import ChatOpenAI\n", "from src.agent.service import AgentService\n", - "from src.planning.service import PlaningService\n" + "from src.agent.service import AgentService\n", + "from src.controller.service import ControllerService\n" ] }, { @@ -39,10 +34,9 @@ "Go to wikipedia.org, search for \"Artificial Intelligence\", \n", "find the section about machine learning, and extract the key points.\n", "\"\"\"\n", - "\n", - "agent = 
AgentService()\n", + "controller = ControllerService()\n", "model = ChatOpenAI(model='gpt-4o')\n", - "planning_service = PlaningService(task, model, agent, use_vision=True)" + "agent = AgentService(task, model, controller, use_vision=True)\n" ] }, { @@ -50,20 +44,7 @@ "execution_count": 3, "metadata": {}, "outputs": [], - "source": [ - "# Enable interactive mode for better visualization\n", - "from IPython.display import display, clear_output\n", - "import time\n", - "\n", - "\n", - "def display_step(step: int, action, result):\n", - "\tclear_output(wait=True)\n", - "\tprint(f'Step {step}:\\n')\n", - "\tprint('Action:')\n", - "\tprint(action)\n", - "\tprint('\\nResult:')\n", - "\tprint(result)" - ] + "source": [] }, { "cell_type": "code", @@ -85,13 +66,19 @@ } ], "source": [ + "# Run the agent step by step\n", + "\n", "max_steps = 50\n", "for i in range(max_steps):\n", - "\taction, result = await planning_service.step()\n", - "\tdisplay_step(i + 1, action, result)\n", + "\tprint(f'\\n📍 Step {i+1}')\n", + "\taction, result = await agent.step()\n", + "\n", + "\tprint('Action:', action)\n", + "\tprint('Result:', result)\n", "\n", "\tif result.done:\n", "\t\tprint('\\n✅ Task completed successfully!')\n", + "\t\tprint('Extracted content:', result.extracted_content)\n", "\t\tbreak\n" ] }, diff --git a/src/agent/prompts.py b/src/agent/prompts.py index 43b2232e2..43cfbc9ae 100644 --- a/src/agent/prompts.py +++ b/src/agent/prompts.py @@ -22,31 +22,38 @@ class AgentSystemPrompt: AGENT_PROMPT = f""" You are an AI agent that helps users interact with websites. - - Your input are all the interactive elements of the current page from which you can choose which to click or input. + Your input are all the interactive elements with its context of the current page from. This is how an input looks like: - 1:Interactive element - 3: - 9:
Interactive element
+ 33: + _: Not clickable, only for context - Additional you get a list of previous actions and their results. + In the beginning the list will be empty. + On elements with _ you can not click. + + Additional you get a list of your previous actions. Available actions (choose EXACTLY ONE, not 0 or 2): {self.default_action_description} - In the beginning the list will be empty so you have to do google search or go to url. - To interact with elements, use their index number in the click() or text_input() actions. Make sure the index exists in the list of interactive elements. - If you need more than the interactive elements from the page you can use the extract_content action. - At every step you HAVE to choose EXACTLY ONE action. + To interact with elements, use their index number in the click_element() or input_text() actions. + If you need more text from the page you can use the extract_page_content action. - Validate if the previous goal is achieved, if not, try to achieve it with the next action. - If you get stuck, try to find a new element that can help you achieve your goal or if persistent, go back or reload the page. - Respond with a valid JSON object containing the action, any required parameters and your current goal of this action. - You can send_user_text or ask_user for clarification if you are completely stuck. + Respond with a valid JSON object, containing the valuation_previous_goal, memory, next_goal and your next action to achieve the next goal. + + valuation_previous_goal: valuation of the previous goal if it is achieved or what went wrong. + memory: This you can use as a memory to store where you are in your overall task. E.g. if you need to find 10 jobs, you can store the already found jobs here. + next_goal: Short description of the next goal you need to achieve. 
+ + If you get stuck and multiple time dont achieve the next_goal, try to find a new element that can help you achieve your task or if persistent, go back or reload the page and try a different approach. + + You can ask_human for clarification if you are completely stuck or if you really need more information. + + If a picture is provided, use it to understand the context and the next action. + + If you are sure you are done you can extract_page_content to get the markdown content and in the next action call done() with the text of the requested result to end the task and wait for further instructions. - Make sure after filling a field if you need to click a suggestion or if the field is already filled. """ return SystemMessage(content=AGENT_PROMPT) @@ -58,7 +65,6 @@ class AgentMessagePrompt: def get_user_message(self) -> HumanMessage: state_description = f""" Current url: {self.state.url} - Interactive elements: {self.state.dom_items_to_string()} """ diff --git a/src/agent/service.py b/src/agent/service.py index 75aea0583..4eb5ecc26 100644 --- a/src/agent/service.py +++ b/src/agent/service.py @@ -36,7 +36,7 @@ class AgentService: ).get_system_message() print(system_prompt) - first_message = HumanMessage(content=f'Your task is: {task}') + first_message = HumanMessage(content=f'Your main task is: {task}') # self.messages_all: list[BaseMessage] = [] self.messages: list[BaseMessage] = [system_prompt, first_message] diff --git a/src/agent/views.py b/src/agent/views.py index baf76a652..7d176e1ef 100644 --- a/src/agent/views.py +++ b/src/agent/views.py @@ -12,8 +12,10 @@ class AskHumanAgentAction(BaseModel): class AgentOnlyAction(BaseModel): + # TODO this is not really and action with function, but more an output only valuation_previous_goal: str - goal: str + memory: str + next_goal: str ask_human: Optional[AskHumanAgentAction] = None @@ -32,4 +34,4 @@ class AgentAction(ControllerActions, AgentOnlyAction): if __name__ == '__main__': - 
print(AgentAction(valuation_previous_goal='Failed', goal='Click')) + print(AgentAction(valuation_previous_goal='Failed', next_goal='Click', memory='')) diff --git a/src/browser/service.py b/src/browser/service.py index 40df0ccdb..d11949ee5 100644 --- a/src/browser/service.py +++ b/src/browser/service.py @@ -194,6 +194,13 @@ class BrowserService: content = MainContentExtractor.extract(driver.page_source, output_format=value) # type: ignore TODO return content + def done(self, text: str): + """ + Ends the task and waits for further instructions. + """ + print(f'Done on page {self.current_state.url}\n\n: {text}') + return text + def take_screenshot(self, full_page: bool = False) -> str: """ Returns a base64 encoded screenshot of the current page. diff --git a/src/controller/service.py b/src/controller/service.py index 5cef75d51..a7f104d81 100644 --- a/src/controller/service.py +++ b/src/controller/service.py @@ -53,7 +53,8 @@ class ControllerService: elif action.go_back: self.browser.go_back() elif action.done: - return ControllerActionResult(done=True) + self.browser.done(action.done.text) + return ControllerActionResult(done=True, extracted_content=action.done.text) elif action.click_element: self.browser.click_element_by_index( action.click_element.id, self.get_cached_browser_state() diff --git a/src/controller/views.py b/src/controller/views.py index 607a43b3b..65c8f8104 100644 --- a/src/controller/views.py +++ b/src/controller/views.py @@ -22,6 +22,10 @@ class InputTextControllerAction(BaseModel): text: str +class DoneControllerAction(BaseModel): + text: str + + class ControllerActions(BaseModel): """ Controller actions you can use to interact. 
@@ -31,7 +35,7 @@ class ControllerActions(BaseModel): go_to_url: Optional[GoToUrlControllerAction] = None nothing: Optional[Literal[True]] = None go_back: Optional[Literal[True]] = None - done: Optional[Literal[True]] = None + done: Optional[DoneControllerAction] = None click_element: Optional[ClickElementControllerAction] = None input_text: Optional[InputTextControllerAction] = None extract_page_content: Optional[Literal[True]] = None @@ -51,7 +55,7 @@ class ControllerActions(BaseModel): - Go back to previous page Example: {"go_back": true} - Mark entire task as complete - Example: {"done": true} + Example: {"done": {"text": "This is the requested result of the task..."}} - Click an element by its ID Example: {"click_element": {"id": 1}} - Input text into an element by its ID diff --git a/src/tests/test_kayak_search.py b/src/tests/test_kayak_search.py index 8f5d9a3c9..675049247 100644 --- a/src/tests/test_kayak_search.py +++ b/src/tests/test_kayak_search.py @@ -79,6 +79,7 @@ async def test_kayak_flight_search(): # # check if output is exactly True (boolean) if result.done: print('\n✅ Task completed successfully') + print('Extracted content:', result.extracted_content) break # time.sleep(0.5) diff --git a/src/tests/test_mind2web.py b/src/tests/test_mind2web.py index 535452281..f49d862bc 100644 --- a/src/tests/test_mind2web.py +++ b/src/tests/test_mind2web.py @@ -82,6 +82,7 @@ async def test_mind2web_samples(): if result.done: print('\n✅ Sample completed successfully') + print('Extracted content:', result.extracted_content) results['successful'] += 1 sample_success = True break @@ -167,6 +168,7 @@ async def test_single_mind2web_sample(): if result.done: print('\n✅ Task completed successfully') + print('Extracted content:', result.extracted_content) break else: print('\n❌ Failed to complete task in maximum steps')