Included memory, improved prompt, and done function

2026-05-06 17:52:15 +02:00 · 2024-11-04 19:22:24 +01:00
parent b867973e3d
commit e91cd2da9d
10 changed files with 60 additions and 49 deletions
--- a/examples/flight_search_example.ipynb
+++ b/examples/flight_search_example.ipynb
@@ -69,6 +69,7 @@
    "\n",
    "\tif result.done:\n",
    "\t\tprint('\\n✅ Task completed successfully!')\n",
+    "\t\tprint('Extracted content:', result.extracted_content)\n",
    "\t\tbreak\n"
   ]
  }
--- a/examples/web_navigation_example.ipynb
+++ b/examples/web_navigation_example.ipynb
@@ -15,17 +15,12 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "import sys\n",
+    "import datetime\n",
    "import os\n",
-    "from pathlib import Path\n",
-    "\n",
-    "notebook_dir = Path(os.getcwd())\n",
-    "project_root = str(notebook_dir.parent)\n",
-    "sys.path.append(project_root)\n",
-    "\n",
    "from langchain_openai import ChatOpenAI\n",
    "from src.agent.service import AgentService\n",
-    "from src.planning.service import PlaningService\n"
+    "from src.agent.service import AgentService\n",
+    "from src.controller.service import ControllerService\n"
   ]
  },
  {
@@ -39,10 +34,9 @@
    "Go to wikipedia.org, search for \"Artificial Intelligence\", \n",
    "find the section about machine learning, and extract the key points.\n",
    "\"\"\"\n",
-    "\n",
-    "agent = AgentService()\n",
+    "controller = ControllerService()\n",
    "model = ChatOpenAI(model='gpt-4o')\n",
-    "planning_service = PlaningService(task, model, agent, use_vision=True)"
+    "agent = AgentService(task, model, controller, use_vision=True)\n"
   ]
  },
  {
@@ -50,20 +44,7 @@
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
-   "source": [
-    "# Enable interactive mode for better visualization\n",
-    "from IPython.display import display, clear_output\n",
-    "import time\n",
-    "\n",
-    "\n",
-    "def display_step(step: int, action, result):\n",
-    "\tclear_output(wait=True)\n",
-    "\tprint(f'Step {step}:\\n')\n",
-    "\tprint('Action:')\n",
-    "\tprint(action)\n",
-    "\tprint('\\nResult:')\n",
-    "\tprint(result)"
-   ]
+   "source": []
  },
  {
   "cell_type": "code",
@@ -85,13 +66,19 @@
    }
   ],
   "source": [
+    "# Run the agent step by step\n",
+    "\n",
    "max_steps = 50\n",
    "for i in range(max_steps):\n",
-    "\taction, result = await planning_service.step()\n",
-    "\tdisplay_step(i + 1, action, result)\n",
+    "\tprint(f'\\n📍 Step {i+1}')\n",
+    "\taction, result = await agent.step()\n",
+    "\n",
+    "\tprint('Action:', action)\n",
+    "\tprint('Result:', result)\n",
    "\n",
    "\tif result.done:\n",
    "\t\tprint('\\n✅ Task completed successfully!')\n",
+    "\t\tprint('Extracted content:', result.extracted_content)\n",
    "\t\tbreak\n"
   ]
  },
--- a/src/agent/prompts.py
+++ b/src/agent/prompts.py
@@ -22,31 +22,38 @@ class AgentSystemPrompt:

 		AGENT_PROMPT = f"""
    You are an AI agent that helps users interact with websites. 
-
-    Your input are all the interactive elements of the current page from which you can choose which to click or input. 
+    Your input are all the interactive elements with its context of the current page from.
    
    This is how an input looks like:
-    1:Interactive element
-    3:	<a href="https://www.ab.de/"></a>
-    9:<div>Interactive element</div>
+    33: <button>Clickable element</button>
+    _: Not clickable, only for context

-    Additional you get a list of previous actions and their results.
+    In the beginning the list will be empty.
+	On elements with _ you can not click.
+    
+	Additional you get a list of your previous actions.

    Available actions (choose EXACTLY ONE, not 0 or 2):

    {self.default_action_description}

-    In the beginning the list will be empty so you have to do google search or go to url.
-    To interact with elements, use their index number in the click() or text_input() actions. Make sure the index exists in the list of interactive elements.
-    If you need more than the interactive elements from the page you can use the extract_content action.
-	At every step you HAVE to choose EXACTLY ONE action.
+    To interact with elements, use their index number in the click_element() or input_text() actions. 
+    If you need more text from the page you can use the extract_page_content action.

-    Validate if the previous goal is achieved, if not, try to achieve it with the next action.
-    If you get stuck, try to find a new element that can help you achieve your goal or if persistent, go back or reload the page.
-    Respond with a valid JSON object containing the action, any required parameters and your current goal of this action.
-    You can send_user_text or ask_user for clarification if you are completely stuck. 
+	Respond with a valid JSON object, containing the valuation_previous_goal, memory, next_goal and your next action to achieve the next goal.
+    
+	valuation_previous_goal: valuation of the previous goal if it is achieved or what went wrong.
+	memory: This you can use as a memory to store where you are in your overall task. E.g. if you need to find 10 jobs, you can store the already found jobs here.
+	next_goal: Short description of the next goal you need to achieve.
+
+    If you get stuck and multiple time dont achieve the next_goal, try to find a new element that can help you achieve your task or if persistent, go back or reload the page and try a different approach.
+    
+	You can ask_human for clarification if you are completely stuck or if you really need more information. 
+
+	If a picture is provided, use it to understand the context and the next action.
+	
+	If you are sure you are done you can extract_page_content to get the markdown content and in the next action call done() with the text of the requested result to end the task and wait for further instructions.

-    Make sure after filling a field if you need to click a suggestion or if the field is already filled.
    """
 		return SystemMessage(content=AGENT_PROMPT)

@@ -58,7 +65,6 @@ class AgentMessagePrompt:
 	def get_user_message(self) -> HumanMessage:
 		state_description = f"""
 Current url: {self.state.url}
-		
 Interactive elements:
 {self.state.dom_items_to_string()}
        """
--- a/src/agent/service.py
+++ b/src/agent/service.py
@@ -36,7 +36,7 @@ class AgentService:
 		).get_system_message()

 		print(system_prompt)
-		first_message = HumanMessage(content=f'Your task is: {task}')
+		first_message = HumanMessage(content=f'Your main task is: {task}')

 		# self.messages_all: list[BaseMessage] = []
 		self.messages: list[BaseMessage] = [system_prompt, first_message]
--- a/src/agent/views.py
+++ b/src/agent/views.py
@@ -12,8 +12,10 @@ class AskHumanAgentAction(BaseModel):


 class AgentOnlyAction(BaseModel):
+	# TODO: this is not really an action with a function, but rather an output-only model
 	valuation_previous_goal: str
-	goal: str
+	memory: str
+	next_goal: str

 	ask_human: Optional[AskHumanAgentAction] = None

@@ -32,4 +34,4 @@ class AgentAction(ControllerActions, AgentOnlyAction):


 if __name__ == '__main__':
-	print(AgentAction(valuation_previous_goal='Failed', goal='Click'))
+	print(AgentAction(valuation_previous_goal='Failed', next_goal='Click', memory=''))
--- a/src/browser/service.py
+++ b/src/browser/service.py
@@ -194,6 +194,13 @@ class BrowserService:
 		content = MainContentExtractor.extract(driver.page_source, output_format=value)  # type: ignore TODO
 		return content

+	def done(self, text: str):
+		"""
+		Prints the final result text together with the current page URL and returns the text.
+		"""
+		print(f'Done on page {self.current_state.url}\n\n: {text}')
+		return text
+
 	def take_screenshot(self, full_page: bool = False) -> str:
 		"""
 		Returns a base64 encoded screenshot of the current page.
--- a/src/controller/service.py
+++ b/src/controller/service.py
@@ -53,7 +53,8 @@ class ControllerService:
 			elif action.go_back:
 				self.browser.go_back()
 			elif action.done:
-				return ControllerActionResult(done=True)
+				self.browser.done(action.done.text)
+				return ControllerActionResult(done=True, extracted_content=action.done.text)
 			elif action.click_element:
 				self.browser.click_element_by_index(
 					action.click_element.id, self.get_cached_browser_state()
--- a/src/controller/views.py
+++ b/src/controller/views.py
@@ -22,6 +22,10 @@ class InputTextControllerAction(BaseModel):
 	text: str


+class DoneControllerAction(BaseModel):
+	text: str
+
+
 class ControllerActions(BaseModel):
 	"""
 	Controller actions you can use to interact.
@@ -31,7 +35,7 @@ class ControllerActions(BaseModel):
 	go_to_url: Optional[GoToUrlControllerAction] = None
 	nothing: Optional[Literal[True]] = None
 	go_back: Optional[Literal[True]] = None
-	done: Optional[Literal[True]] = None
+	done: Optional[DoneControllerAction] = None
 	click_element: Optional[ClickElementControllerAction] = None
 	input_text: Optional[InputTextControllerAction] = None
 	extract_page_content: Optional[Literal[True]] = None
@@ -51,7 +55,7 @@ class ControllerActions(BaseModel):
 - Go back to previous page
  Example: {"go_back": true}
 - Mark entire task as complete
-  Example: {"done": true}
+  Example: {"done": {"text": "This is the requested result of the task..."}}
 - Click an element by its ID
  Example: {"click_element": {"id": 1}}
 - Input text into an element by its ID
--- a/src/tests/test_kayak_search.py
+++ b/src/tests/test_kayak_search.py
@@ -79,6 +79,7 @@ async def test_kayak_flight_search():
 			# # check if output is exactly True (boolean)
 			if result.done:
 				print('\n✅ Task completed successfully')
+				print('Extracted content:', result.extracted_content)
 				break

 			# time.sleep(0.5)
--- a/src/tests/test_mind2web.py
+++ b/src/tests/test_mind2web.py
@@ -82,6 +82,7 @@ async def test_mind2web_samples():

 				if result.done:
 					print('\n✅ Sample completed successfully')
+					print('Extracted content:', result.extracted_content)
 					results['successful'] += 1
 					sample_success = True
 					break
@@ -167,6 +168,7 @@ async def test_single_mind2web_sample():

 			if result.done:
 				print('\n✅ Task completed successfully')
+				print('Extracted content:', result.extracted_content)
 				break
 		else:
 			print('\n❌ Failed to complete task in maximum steps')