mirror of
https://github.com/browser-use/browser-use
synced 2026-05-06 17:52:15 +02:00
Included memory, improved prompt, and done function
This commit is contained in:
@@ -69,6 +69,7 @@
|
||||
"\n",
|
||||
"\tif result.done:\n",
|
||||
"\t\tprint('\\n✅ Task completed successfully!')\n",
|
||||
"\t\tprint('Extracted content:', result.extracted_content)\n",
|
||||
"\t\tbreak\n"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -15,17 +15,12 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"import datetime\n",
|
||||
"import os\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"notebook_dir = Path(os.getcwd())\n",
|
||||
"project_root = str(notebook_dir.parent)\n",
|
||||
"sys.path.append(project_root)\n",
|
||||
"\n",
|
||||
"from langchain_openai import ChatOpenAI\n",
|
||||
"from src.agent.service import AgentService\n",
|
||||
"from src.planning.service import PlaningService\n"
|
||||
"from src.agent.service import AgentService\n",
|
||||
"from src.controller.service import ControllerService\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -39,10 +34,9 @@
|
||||
"Go to wikipedia.org, search for \"Artificial Intelligence\", \n",
|
||||
"find the section about machine learning, and extract the key points.\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"agent = AgentService()\n",
|
||||
"controller = ControllerService()\n",
|
||||
"model = ChatOpenAI(model='gpt-4o')\n",
|
||||
"planning_service = PlaningService(task, model, agent, use_vision=True)"
|
||||
"agent = AgentService(task, model, controller, use_vision=True)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -50,20 +44,7 @@
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Enable interactive mode for better visualization\n",
|
||||
"from IPython.display import display, clear_output\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def display_step(step: int, action, result):\n",
|
||||
"\tclear_output(wait=True)\n",
|
||||
"\tprint(f'Step {step}:\\n')\n",
|
||||
"\tprint('Action:')\n",
|
||||
"\tprint(action)\n",
|
||||
"\tprint('\\nResult:')\n",
|
||||
"\tprint(result)"
|
||||
]
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
@@ -85,13 +66,19 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Run the agent step by step\n",
|
||||
"\n",
|
||||
"max_steps = 50\n",
|
||||
"for i in range(max_steps):\n",
|
||||
"\taction, result = await planning_service.step()\n",
|
||||
"\tdisplay_step(i + 1, action, result)\n",
|
||||
"\tprint(f'\\n📍 Step {i+1}')\n",
|
||||
"\taction, result = await agent.step()\n",
|
||||
"\n",
|
||||
"\tprint('Action:', action)\n",
|
||||
"\tprint('Result:', result)\n",
|
||||
"\n",
|
||||
"\tif result.done:\n",
|
||||
"\t\tprint('\\n✅ Task completed successfully!')\n",
|
||||
"\t\tprint('Extracted content:', result.extracted_content)\n",
|
||||
"\t\tbreak\n"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -22,31 +22,38 @@ class AgentSystemPrompt:
|
||||
|
||||
AGENT_PROMPT = f"""
|
||||
You are an AI agent that helps users interact with websites.
|
||||
|
||||
Your input are all the interactive elements of the current page from which you can choose which to click or input.
|
||||
Your input are all the interactive elements with its context of the current page from.
|
||||
|
||||
This is how an input looks like:
|
||||
1:Interactive element
|
||||
3: <a href="https://www.ab.de/"></a>
|
||||
9:<div>Interactive element</div>
|
||||
33: <button>Clickable element</button>
|
||||
_: Not clickable, only for context
|
||||
|
||||
Additional you get a list of previous actions and their results.
|
||||
In the beginning the list will be empty.
|
||||
On elements with _ you can not click.
|
||||
|
||||
Additional you get a list of your previous actions.
|
||||
|
||||
Available actions (choose EXACTLY ONE, not 0 or 2):
|
||||
|
||||
{self.default_action_description}
|
||||
|
||||
In the beginning the list will be empty so you have to do google search or go to url.
|
||||
To interact with elements, use their index number in the click() or text_input() actions. Make sure the index exists in the list of interactive elements.
|
||||
If you need more than the interactive elements from the page you can use the extract_content action.
|
||||
At every step you HAVE to choose EXACTLY ONE action.
|
||||
To interact with elements, use their index number in the click_element() or input_text() actions.
|
||||
If you need more text from the page you can use the extract_page_content action.
|
||||
|
||||
Validate if the previous goal is achieved, if not, try to achieve it with the next action.
|
||||
If you get stuck, try to find a new element that can help you achieve your goal or if persistent, go back or reload the page.
|
||||
Respond with a valid JSON object containing the action, any required parameters and your current goal of this action.
|
||||
You can send_user_text or ask_user for clarification if you are completely stuck.
|
||||
Respond with a valid JSON object, containing the valuation_previous_goal, memory, next_goal and your next action to achieve the next goal.
|
||||
|
||||
valuation_previous_goal: valuation of the previous goal if it is achieved or what went wrong.
|
||||
memory: This you can use as a memory to store where you are in your overall task. E.g. if you need to find 10 jobs, you can store the already found jobs here.
|
||||
next_goal: Short description of the next goal you need to achieve.
|
||||
|
||||
If you get stuck and multiple time dont achieve the next_goal, try to find a new element that can help you achieve your task or if persistent, go back or reload the page and try a different approach.
|
||||
|
||||
You can ask_human for clarification if you are completely stuck or if you really need more information.
|
||||
|
||||
If a picture is provided, use it to understand the context and the next action.
|
||||
|
||||
If you are sure you are done you can extract_page_content to get the markdown content and in the next action call done() with the text of the requested result to end the task and wait for further instructions.
|
||||
|
||||
Make sure after filling a field if you need to click a suggestion or if the field is already filled.
|
||||
"""
|
||||
return SystemMessage(content=AGENT_PROMPT)
|
||||
|
||||
@@ -58,7 +65,6 @@ class AgentMessagePrompt:
|
||||
def get_user_message(self) -> HumanMessage:
|
||||
state_description = f"""
|
||||
Current url: {self.state.url}
|
||||
|
||||
Interactive elements:
|
||||
{self.state.dom_items_to_string()}
|
||||
"""
|
||||
|
||||
@@ -36,7 +36,7 @@ class AgentService:
|
||||
).get_system_message()
|
||||
|
||||
print(system_prompt)
|
||||
first_message = HumanMessage(content=f'Your task is: {task}')
|
||||
first_message = HumanMessage(content=f'Your main task is: {task}')
|
||||
|
||||
# self.messages_all: list[BaseMessage] = []
|
||||
self.messages: list[BaseMessage] = [system_prompt, first_message]
|
||||
|
||||
@@ -12,8 +12,10 @@ class AskHumanAgentAction(BaseModel):
|
||||
|
||||
|
||||
class AgentOnlyAction(BaseModel):
|
||||
# TODO this is not really and action with function, but more an output only
|
||||
valuation_previous_goal: str
|
||||
goal: str
|
||||
memory: str
|
||||
next_goal: str
|
||||
|
||||
ask_human: Optional[AskHumanAgentAction] = None
|
||||
|
||||
@@ -32,4 +34,4 @@ class AgentAction(ControllerActions, AgentOnlyAction):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(AgentAction(valuation_previous_goal='Failed', goal='Click'))
|
||||
print(AgentAction(valuation_previous_goal='Failed', next_goal='Click', memory=''))
|
||||
|
||||
@@ -194,6 +194,13 @@ class BrowserService:
|
||||
content = MainContentExtractor.extract(driver.page_source, output_format=value) # type: ignore TODO
|
||||
return content
|
||||
|
||||
def done(self, text: str):
|
||||
"""
|
||||
Ends the task and waits for further instructions.
|
||||
"""
|
||||
print(f'Done on page {self.current_state.url}\n\n: {text}')
|
||||
return text
|
||||
|
||||
def take_screenshot(self, full_page: bool = False) -> str:
|
||||
"""
|
||||
Returns a base64 encoded screenshot of the current page.
|
||||
|
||||
@@ -53,7 +53,8 @@ class ControllerService:
|
||||
elif action.go_back:
|
||||
self.browser.go_back()
|
||||
elif action.done:
|
||||
return ControllerActionResult(done=True)
|
||||
self.browser.done(action.done.text)
|
||||
return ControllerActionResult(done=True, extracted_content=action.done.text)
|
||||
elif action.click_element:
|
||||
self.browser.click_element_by_index(
|
||||
action.click_element.id, self.get_cached_browser_state()
|
||||
|
||||
@@ -22,6 +22,10 @@ class InputTextControllerAction(BaseModel):
|
||||
text: str
|
||||
|
||||
|
||||
class DoneControllerAction(BaseModel):
|
||||
text: str
|
||||
|
||||
|
||||
class ControllerActions(BaseModel):
|
||||
"""
|
||||
Controller actions you can use to interact.
|
||||
@@ -31,7 +35,7 @@ class ControllerActions(BaseModel):
|
||||
go_to_url: Optional[GoToUrlControllerAction] = None
|
||||
nothing: Optional[Literal[True]] = None
|
||||
go_back: Optional[Literal[True]] = None
|
||||
done: Optional[Literal[True]] = None
|
||||
done: Optional[DoneControllerAction] = None
|
||||
click_element: Optional[ClickElementControllerAction] = None
|
||||
input_text: Optional[InputTextControllerAction] = None
|
||||
extract_page_content: Optional[Literal[True]] = None
|
||||
@@ -51,7 +55,7 @@ class ControllerActions(BaseModel):
|
||||
- Go back to previous page
|
||||
Example: {"go_back": true}
|
||||
- Mark entire task as complete
|
||||
Example: {"done": true}
|
||||
Example: {"done": {"text": "This is the requested result of the task..."}}
|
||||
- Click an element by its ID
|
||||
Example: {"click_element": {"id": 1}}
|
||||
- Input text into an element by its ID
|
||||
|
||||
@@ -79,6 +79,7 @@ async def test_kayak_flight_search():
|
||||
# # check if output is exactly True (boolean)
|
||||
if result.done:
|
||||
print('\n✅ Task completed successfully')
|
||||
print('Extracted content:', result.extracted_content)
|
||||
break
|
||||
|
||||
# time.sleep(0.5)
|
||||
|
||||
@@ -82,6 +82,7 @@ async def test_mind2web_samples():
|
||||
|
||||
if result.done:
|
||||
print('\n✅ Sample completed successfully')
|
||||
print('Extracted content:', result.extracted_content)
|
||||
results['successful'] += 1
|
||||
sample_success = True
|
||||
break
|
||||
@@ -167,6 +168,7 @@ async def test_single_mind2web_sample():
|
||||
|
||||
if result.done:
|
||||
print('\n✅ Task completed successfully')
|
||||
print('Extracted content:', result.extracted_content)
|
||||
break
|
||||
else:
|
||||
print('\n❌ Failed to complete task in maximum steps')
|
||||
|
||||
Reference in New Issue
Block a user