From e91cd2da9d5805d40554356ddf98d1bfa44d33cc Mon Sep 17 00:00:00 2001
From: magmueller <mamagnus00@gmail.com>
Date: Mon, 4 Nov 2024 19:22:24 +0100
Subject: [PATCH] Included memory, improved prompt, and done function

---
 examples/flight_search_example.ipynb  |  1 +
 examples/web_navigation_example.ipynb | 41 +++++++++------------------
 src/agent/prompts.py                  | 38 ++++++++++++++-----------
 src/agent/service.py                  |  2 +-
 src/agent/views.py                    |  6 ++--
 src/browser/service.py                |  7 +++++
 src/controller/service.py             |  3 +-
 src/controller/views.py               |  8 ++++--
 src/tests/test_kayak_search.py        |  1 +
 src/tests/test_mind2web.py            |  2 ++
 10 files changed, 60 insertions(+), 49 deletions(-)

diff --git a/examples/flight_search_example.ipynb b/examples/flight_search_example.ipynb
index 887b228b1..b0d6c78d4 100644
--- a/examples/flight_search_example.ipynb
+++ b/examples/flight_search_example.ipynb
@@ -69,6 +69,7 @@
     "\n",
     "\tif result.done:\n",
     "\t\tprint('\\n✅ Task completed successfully!')\n",
+    "\t\tprint('Extracted content:', result.extracted_content)\n",
     "\t\tbreak\n"
    ]
   }
diff --git a/examples/web_navigation_example.ipynb b/examples/web_navigation_example.ipynb
index ec6bbc6c2..aa18f98d3 100644
--- a/examples/web_navigation_example.ipynb
+++ b/examples/web_navigation_example.ipynb
@@ -15,17 +15,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import sys\n",
+    "import datetime\n",
     "import os\n",
-    "from pathlib import Path\n",
-    "\n",
-    "notebook_dir = Path(os.getcwd())\n",
-    "project_root = str(notebook_dir.parent)\n",
-    "sys.path.append(project_root)\n",
-    "\n",
     "from langchain_openai import ChatOpenAI\n",
     "from src.agent.service import AgentService\n",
-    "from src.planning.service import PlaningService\n"
+    "from src.agent.service import AgentService\n",
+    "from src.controller.service import ControllerService\n"
    ]
   },
   {
@@ -39,10 +34,9 @@
     "Go to wikipedia.org, search for \"Artificial Intelligence\", \n",
     "find the section about machine learning, and extract the key points.\n",
     "\"\"\"\n",
-    "\n",
-    "agent = AgentService()\n",
+    "controller = ControllerService()\n",
     "model = ChatOpenAI(model='gpt-4o')\n",
-    "planning_service = PlaningService(task, model, agent, use_vision=True)"
+    "agent = AgentService(task, model, controller, use_vision=True)\n"
    ]
   },
   {
@@ -50,20 +44,7 @@
    "execution_count": 3,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# Enable interactive mode for better visualization\n",
-    "from IPython.display import display, clear_output\n",
-    "import time\n",
-    "\n",
-    "\n",
-    "def display_step(step: int, action, result):\n",
-    "\tclear_output(wait=True)\n",
-    "\tprint(f'Step {step}:\\n')\n",
-    "\tprint('Action:')\n",
-    "\tprint(action)\n",
-    "\tprint('\\nResult:')\n",
-    "\tprint(result)"
-   ]
+   "source": []
   },
   {
    "cell_type": "code",
@@ -85,13 +66,19 @@
     }
    ],
    "source": [
+    "# Run the agent step by step\n",
+    "\n",
     "max_steps = 50\n",
     "for i in range(max_steps):\n",
-    "\taction, result = await planning_service.step()\n",
-    "\tdisplay_step(i + 1, action, result)\n",
+    "\tprint(f'\\n📍 Step {i+1}')\n",
+    "\taction, result = await agent.step()\n",
+    "\n",
+    "\tprint('Action:', action)\n",
+    "\tprint('Result:', result)\n",
     "\n",
     "\tif result.done:\n",
     "\t\tprint('\\n✅ Task completed successfully!')\n",
+    "\t\tprint('Extracted content:', result.extracted_content)\n",
     "\t\tbreak\n"
    ]
   },
diff --git a/src/agent/prompts.py b/src/agent/prompts.py
index 43b2232e2..43cfbc9ae 100644
--- a/src/agent/prompts.py
+++ b/src/agent/prompts.py
@@ -22,31 +22,38 @@ class AgentSystemPrompt:
 
 		AGENT_PROMPT = f"""
     You are an AI agent that helps users interact with websites. 
-
-    Your input are all the interactive elements of the current page from which you can choose which to click or input. 
+    Your input is the list of all interactive elements of the current page, together with their context.
     
     This is how an input looks like:
-    1:Interactive element
-    3:	<a href="https://www.ab.de/"></a>
-    9:<div>Interactive element</div>
+    33: <button>Clickable element</button>
+    _: Not clickable, only for context
 
-    Additional you get a list of previous actions and their results.
+    In the beginning the list will be empty.
+	On elements with _ you can not click.
+    
+	Additional you get a list of your previous actions.
 
     Available actions (choose EXACTLY ONE, not 0 or 2):
 
     {self.default_action_description}
 
-    In the beginning the list will be empty so you have to do google search or go to url.
-    To interact with elements, use their index number in the click() or text_input() actions. Make sure the index exists in the list of interactive elements.
-    If you need more than the interactive elements from the page you can use the extract_content action.
-	At every step you HAVE to choose EXACTLY ONE action.
+    To interact with elements, use their index number in the click_element() or input_text() actions. 
+    If you need more text from the page you can use the extract_page_content action.
 
-    Validate if the previous goal is achieved, if not, try to achieve it with the next action.
-    If you get stuck, try to find a new element that can help you achieve your goal or if persistent, go back or reload the page.
-    Respond with a valid JSON object containing the action, any required parameters and your current goal of this action.
-    You can send_user_text or ask_user for clarification if you are completely stuck. 
+	Respond with a valid JSON object, containing the valuation_previous_goal, memory, next_goal and your next action to achieve the next goal.
+    
+	valuation_previous_goal: valuation of the previous goal if it is achieved or what went wrong.
+	memory: This you can use as a memory to store where you are in your overall task. E.g. if you need to find 10 jobs, you can store the already found jobs here.
+	next_goal: Short description of the next goal you need to achieve.
+
+    If you get stuck and repeatedly fail to achieve the next_goal, try to find a new element that can help you achieve your task or, if the problem persists, go back or reload the page and try a different approach.
+    
+	You can ask_human for clarification if you are completely stuck or if you really need more information. 
+
+	If a picture is provided, use it to understand the context and the next action.
+	
+	If you are sure you are done you can extract_page_content to get the markdown content and in the next action call done() with the text of the requested result to end the task and wait for further instructions.
 
-    Make sure after filling a field if you need to click a suggestion or if the field is already filled.
     """
 		return SystemMessage(content=AGENT_PROMPT)
 
@@ -58,7 +65,6 @@ class AgentMessagePrompt:
 	def get_user_message(self) -> HumanMessage:
 		state_description = f"""
 Current url: {self.state.url}
-		
 Interactive elements:
 {self.state.dom_items_to_string()}
         """
diff --git a/src/agent/service.py b/src/agent/service.py
index 75aea0583..4eb5ecc26 100644
--- a/src/agent/service.py
+++ b/src/agent/service.py
@@ -36,7 +36,7 @@ class AgentService:
 		).get_system_message()
 
 		print(system_prompt)
-		first_message = HumanMessage(content=f'Your task is: {task}')
+		first_message = HumanMessage(content=f'Your main task is: {task}')
 
 		# self.messages_all: list[BaseMessage] = []
 		self.messages: list[BaseMessage] = [system_prompt, first_message]
diff --git a/src/agent/views.py b/src/agent/views.py
index baf76a652..7d176e1ef 100644
--- a/src/agent/views.py
+++ b/src/agent/views.py
@@ -12,8 +12,10 @@ class AskHumanAgentAction(BaseModel):
 
 
 class AgentOnlyAction(BaseModel):
+	# TODO: this is not really an action with a function, but more of an output-only model
 	valuation_previous_goal: str
-	goal: str
+	memory: str
+	next_goal: str
 
 	ask_human: Optional[AskHumanAgentAction] = None
 
@@ -32,4 +34,4 @@ class AgentAction(ControllerActions, AgentOnlyAction):
 
 
 if __name__ == '__main__':
-	print(AgentAction(valuation_previous_goal='Failed', goal='Click'))
+	print(AgentAction(valuation_previous_goal='Failed', next_goal='Click', memory=''))
diff --git a/src/browser/service.py b/src/browser/service.py
index 40df0ccdb..d11949ee5 100644
--- a/src/browser/service.py
+++ b/src/browser/service.py
@@ -194,6 +194,13 @@ class BrowserService:
 		content = MainContentExtractor.extract(driver.page_source, output_format=value)  # type: ignore TODO
 		return content
 
+	def done(self, text: str):
+		"""
+		Reports the end of the task: prints the result text with the current page URL and returns the text.
+		"""
+		print(f'Done on page {self.current_state.url}\n\n: {text}')
+		return text
+
 	def take_screenshot(self, full_page: bool = False) -> str:
 		"""
 		Returns a base64 encoded screenshot of the current page.
diff --git a/src/controller/service.py b/src/controller/service.py
index 5cef75d51..a7f104d81 100644
--- a/src/controller/service.py
+++ b/src/controller/service.py
@@ -53,7 +53,8 @@ class ControllerService:
 			elif action.go_back:
 				self.browser.go_back()
 			elif action.done:
-				return ControllerActionResult(done=True)
+				self.browser.done(action.done.text)
+				return ControllerActionResult(done=True, extracted_content=action.done.text)
 			elif action.click_element:
 				self.browser.click_element_by_index(
 					action.click_element.id, self.get_cached_browser_state()
diff --git a/src/controller/views.py b/src/controller/views.py
index 607a43b3b..65c8f8104 100644
--- a/src/controller/views.py
+++ b/src/controller/views.py
@@ -22,6 +22,10 @@ class InputTextControllerAction(BaseModel):
 	text: str
 
 
+class DoneControllerAction(BaseModel):
+	text: str
+
+
 class ControllerActions(BaseModel):
 	"""
 	Controller actions you can use to interact.
@@ -31,7 +35,7 @@ class ControllerActions(BaseModel):
 	go_to_url: Optional[GoToUrlControllerAction] = None
 	nothing: Optional[Literal[True]] = None
 	go_back: Optional[Literal[True]] = None
-	done: Optional[Literal[True]] = None
+	done: Optional[DoneControllerAction] = None
 	click_element: Optional[ClickElementControllerAction] = None
 	input_text: Optional[InputTextControllerAction] = None
 	extract_page_content: Optional[Literal[True]] = None
@@ -51,7 +55,7 @@ class ControllerActions(BaseModel):
 - Go back to previous page
   Example: {"go_back": true}
 - Mark entire task as complete
-  Example: {"done": true}
+  Example: {"done": {"text": "This is the requested result of the task..."}}
 - Click an element by its ID
   Example: {"click_element": {"id": 1}}
 - Input text into an element by its ID
diff --git a/src/tests/test_kayak_search.py b/src/tests/test_kayak_search.py
index 8f5d9a3c9..675049247 100644
--- a/src/tests/test_kayak_search.py
+++ b/src/tests/test_kayak_search.py
@@ -79,6 +79,7 @@ async def test_kayak_flight_search():
 			# # check if output is exactly True (boolean)
 			if result.done:
 				print('\n✅ Task completed successfully')
+				print('Extracted content:', result.extracted_content)
 				break
 
 			# time.sleep(0.5)
diff --git a/src/tests/test_mind2web.py b/src/tests/test_mind2web.py
index 535452281..f49d862bc 100644
--- a/src/tests/test_mind2web.py
+++ b/src/tests/test_mind2web.py
@@ -82,6 +82,7 @@ async def test_mind2web_samples():
 
 				if result.done:
 					print('\n✅ Sample completed successfully')
+					print('Extracted content:', result.extracted_content)
 					results['successful'] += 1
 					sample_success = True
 					break
@@ -167,6 +168,7 @@ async def test_single_mind2web_sample():
 
 			if result.done:
 				print('\n✅ Task completed successfully')
+				print('Extracted content:', result.extracted_content)
 				break
 		else:
 			print('\n❌ Failed to complete task in maximum steps')