diff --git a/examples/flight_search_example.ipynb b/examples/flight_search_example.ipynb
index 887b228b1..b0d6c78d4 100644
--- a/examples/flight_search_example.ipynb
+++ b/examples/flight_search_example.ipynb
@@ -69,6 +69,7 @@
"\n",
"\tif result.done:\n",
"\t\tprint('\\nā Task completed successfully!')\n",
+ "\t\tprint('Extracted content:', result.extracted_content)\n",
"\t\tbreak\n"
]
}
diff --git a/examples/web_navigation_example.ipynb b/examples/web_navigation_example.ipynb
index ec6bbc6c2..aa18f98d3 100644
--- a/examples/web_navigation_example.ipynb
+++ b/examples/web_navigation_example.ipynb
@@ -15,17 +15,12 @@
"metadata": {},
"outputs": [],
"source": [
- "import sys\n",
+ "import datetime\n",
"import os\n",
- "from pathlib import Path\n",
- "\n",
- "notebook_dir = Path(os.getcwd())\n",
- "project_root = str(notebook_dir.parent)\n",
- "sys.path.append(project_root)\n",
- "\n",
"from langchain_openai import ChatOpenAI\n",
"from src.agent.service import AgentService\n",
- "from src.planning.service import PlaningService\n"
+ "from src.agent.service import AgentService\n",
+ "from src.controller.service import ControllerService\n"
]
},
{
@@ -39,10 +34,9 @@
"Go to wikipedia.org, search for \"Artificial Intelligence\", \n",
"find the section about machine learning, and extract the key points.\n",
"\"\"\"\n",
- "\n",
- "agent = AgentService()\n",
+ "controller = ControllerService()\n",
"model = ChatOpenAI(model='gpt-4o')\n",
- "planning_service = PlaningService(task, model, agent, use_vision=True)"
+ "agent = AgentService(task, model, controller, use_vision=True)\n"
]
},
{
@@ -50,20 +44,7 @@
"execution_count": 3,
"metadata": {},
"outputs": [],
- "source": [
- "# Enable interactive mode for better visualization\n",
- "from IPython.display import display, clear_output\n",
- "import time\n",
- "\n",
- "\n",
- "def display_step(step: int, action, result):\n",
- "\tclear_output(wait=True)\n",
- "\tprint(f'Step {step}:\\n')\n",
- "\tprint('Action:')\n",
- "\tprint(action)\n",
- "\tprint('\\nResult:')\n",
- "\tprint(result)"
- ]
+ "source": []
},
{
"cell_type": "code",
@@ -85,13 +66,19 @@
}
],
"source": [
+ "# Run the agent step by step\n",
+ "\n",
"max_steps = 50\n",
"for i in range(max_steps):\n",
- "\taction, result = await planning_service.step()\n",
- "\tdisplay_step(i + 1, action, result)\n",
+ "\tprint(f'\\nš Step {i+1}')\n",
+ "\taction, result = await agent.step()\n",
+ "\n",
+ "\tprint('Action:', action)\n",
+ "\tprint('Result:', result)\n",
"\n",
"\tif result.done:\n",
"\t\tprint('\\nā Task completed successfully!')\n",
+ "\t\tprint('Extracted content:', result.extracted_content)\n",
"\t\tbreak\n"
]
},
diff --git a/src/agent/prompts.py b/src/agent/prompts.py
index 43b2232e2..43cfbc9ae 100644
--- a/src/agent/prompts.py
+++ b/src/agent/prompts.py
@@ -22,31 +22,38 @@ class AgentSystemPrompt:
AGENT_PROMPT = f"""
You are an AI agent that helps users interact with websites.
-
- Your input are all the interactive elements of the current page from which you can choose which to click or input.
+ Your input are all the interactive elements with its context of the current page from.
This is how an input looks like:
- 1:Interactive element
- 3:
- 9:
Interactive element
+ 33:
+ _: Not clickable, only for context
- Additional you get a list of previous actions and their results.
+ In the beginning the list will be empty.
+ On elements with _ you can not click.
+
+ Additional you get a list of your previous actions.
Available actions (choose EXACTLY ONE, not 0 or 2):
{self.default_action_description}
- In the beginning the list will be empty so you have to do google search or go to url.
- To interact with elements, use their index number in the click() or text_input() actions. Make sure the index exists in the list of interactive elements.
- If you need more than the interactive elements from the page you can use the extract_content action.
- At every step you HAVE to choose EXACTLY ONE action.
+ To interact with elements, use their index number in the click_element() or input_text() actions.
+ If you need more text from the page you can use the extract_page_content action.
- Validate if the previous goal is achieved, if not, try to achieve it with the next action.
- If you get stuck, try to find a new element that can help you achieve your goal or if persistent, go back or reload the page.
- Respond with a valid JSON object containing the action, any required parameters and your current goal of this action.
- You can send_user_text or ask_user for clarification if you are completely stuck.
+ Respond with a valid JSON object, containing the valuation_previous_goal, memory, next_goal and your next action to achieve the next goal.
+
+ valuation_previous_goal: valuation of the previous goal if it is achieved or what went wrong.
+ memory: This you can use as a memory to store where you are in your overall task. E.g. if you need to find 10 jobs, you can store the already found jobs here.
+ next_goal: Short description of the next goal you need to achieve.
+
+ If you get stuck and multiple time dont achieve the next_goal, try to find a new element that can help you achieve your task or if persistent, go back or reload the page and try a different approach.
+
+ You can ask_human for clarification if you are completely stuck or if you really need more information.
+
+ If a picture is provided, use it to understand the context and the next action.
+
+ If you are sure you are done you can extract_page_content to get the markdown content and in the next action call done() with the text of the requested result to end the task and wait for further instructions.
- Make sure after filling a field if you need to click a suggestion or if the field is already filled.
"""
return SystemMessage(content=AGENT_PROMPT)
@@ -58,7 +65,6 @@ class AgentMessagePrompt:
def get_user_message(self) -> HumanMessage:
state_description = f"""
Current url: {self.state.url}
-
Interactive elements:
{self.state.dom_items_to_string()}
"""
diff --git a/src/agent/service.py b/src/agent/service.py
index 75aea0583..4eb5ecc26 100644
--- a/src/agent/service.py
+++ b/src/agent/service.py
@@ -36,7 +36,7 @@ class AgentService:
).get_system_message()
print(system_prompt)
- first_message = HumanMessage(content=f'Your task is: {task}')
+ first_message = HumanMessage(content=f'Your main task is: {task}')
# self.messages_all: list[BaseMessage] = []
self.messages: list[BaseMessage] = [system_prompt, first_message]
diff --git a/src/agent/views.py b/src/agent/views.py
index baf76a652..7d176e1ef 100644
--- a/src/agent/views.py
+++ b/src/agent/views.py
@@ -12,8 +12,10 @@ class AskHumanAgentAction(BaseModel):
class AgentOnlyAction(BaseModel):
+ # TODO: this is not really an action with a function, but more of an output-only structure
valuation_previous_goal: str
- goal: str
+ memory: str
+ next_goal: str
ask_human: Optional[AskHumanAgentAction] = None
@@ -32,4 +34,4 @@ class AgentAction(ControllerActions, AgentOnlyAction):
if __name__ == '__main__':
- print(AgentAction(valuation_previous_goal='Failed', goal='Click'))
+ print(AgentAction(valuation_previous_goal='Failed', next_goal='Click', memory=''))
diff --git a/src/browser/service.py b/src/browser/service.py
index 40df0ccdb..d11949ee5 100644
--- a/src/browser/service.py
+++ b/src/browser/service.py
@@ -194,6 +194,13 @@ class BrowserService:
content = MainContentExtractor.extract(driver.page_source, output_format=value) # type: ignore TODO
return content
+ def done(self, text: str):
+ """
+ Prints the final result text together with the current page URL and returns the text unchanged.
+ """
+ print(f'Done on page {self.current_state.url}\n\n: {text}')
+ return text
+
def take_screenshot(self, full_page: bool = False) -> str:
"""
Returns a base64 encoded screenshot of the current page.
diff --git a/src/controller/service.py b/src/controller/service.py
index 5cef75d51..a7f104d81 100644
--- a/src/controller/service.py
+++ b/src/controller/service.py
@@ -53,7 +53,8 @@ class ControllerService:
elif action.go_back:
self.browser.go_back()
elif action.done:
- return ControllerActionResult(done=True)
+ self.browser.done(action.done.text)
+ return ControllerActionResult(done=True, extracted_content=action.done.text)
elif action.click_element:
self.browser.click_element_by_index(
action.click_element.id, self.get_cached_browser_state()
diff --git a/src/controller/views.py b/src/controller/views.py
index 607a43b3b..65c8f8104 100644
--- a/src/controller/views.py
+++ b/src/controller/views.py
@@ -22,6 +22,10 @@ class InputTextControllerAction(BaseModel):
text: str
+class DoneControllerAction(BaseModel):
+ text: str
+
+
class ControllerActions(BaseModel):
"""
Controller actions you can use to interact.
@@ -31,7 +35,7 @@ class ControllerActions(BaseModel):
go_to_url: Optional[GoToUrlControllerAction] = None
nothing: Optional[Literal[True]] = None
go_back: Optional[Literal[True]] = None
- done: Optional[Literal[True]] = None
+ done: Optional[DoneControllerAction] = None
click_element: Optional[ClickElementControllerAction] = None
input_text: Optional[InputTextControllerAction] = None
extract_page_content: Optional[Literal[True]] = None
@@ -51,7 +55,7 @@ class ControllerActions(BaseModel):
- Go back to previous page
Example: {"go_back": true}
- Mark entire task as complete
- Example: {"done": true}
+ Example: {"done": {"text": "This is the requested result of the task..."}}
- Click an element by its ID
Example: {"click_element": {"id": 1}}
- Input text into an element by its ID
diff --git a/src/tests/test_kayak_search.py b/src/tests/test_kayak_search.py
index 8f5d9a3c9..675049247 100644
--- a/src/tests/test_kayak_search.py
+++ b/src/tests/test_kayak_search.py
@@ -79,6 +79,7 @@ async def test_kayak_flight_search():
# # check if output is exactly True (boolean)
if result.done:
print('\nā Task completed successfully')
+ print('Extracted content:', result.extracted_content)
break
# time.sleep(0.5)
diff --git a/src/tests/test_mind2web.py b/src/tests/test_mind2web.py
index 535452281..f49d862bc 100644
--- a/src/tests/test_mind2web.py
+++ b/src/tests/test_mind2web.py
@@ -82,6 +82,7 @@ async def test_mind2web_samples():
if result.done:
print('\nā Sample completed successfully')
+ print('Extracted content:', result.extracted_content)
results['successful'] += 1
sample_success = True
break
@@ -167,6 +168,7 @@ async def test_single_mind2web_sample():
if result.done:
print('\nā Task completed successfully')
+ print('Extracted content:', result.extracted_content)
break
else:
print('\nā Failed to complete task in maximum steps')