Included memory, improved prompt, and done function

This commit is contained in:
magmueller
2024-11-04 19:22:24 +01:00
parent b867973e3d
commit e91cd2da9d
10 changed files with 60 additions and 49 deletions

View File

@@ -69,6 +69,7 @@
"\n",
"\tif result.done:\n",
"\t\tprint('\\n✅ Task completed successfully!')\n",
"\t\tprint('Extracted content:', result.extracted_content)\n",
"\t\tbreak\n"
]
}

View File

@@ -15,17 +15,12 @@
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import datetime\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"notebook_dir = Path(os.getcwd())\n",
"project_root = str(notebook_dir.parent)\n",
"sys.path.append(project_root)\n",
"\n",
"from langchain_openai import ChatOpenAI\n",
"from src.agent.service import AgentService\n",
"from src.planning.service import PlaningService\n"
"from src.agent.service import AgentService\n",
"from src.controller.service import ControllerService\n"
]
},
{
@@ -39,10 +34,9 @@
"Go to wikipedia.org, search for \"Artificial Intelligence\", \n",
"find the section about machine learning, and extract the key points.\n",
"\"\"\"\n",
"\n",
"agent = AgentService()\n",
"controller = ControllerService()\n",
"model = ChatOpenAI(model='gpt-4o')\n",
"planning_service = PlaningService(task, model, agent, use_vision=True)"
"agent = AgentService(task, model, controller, use_vision=True)\n"
]
},
{
@@ -50,20 +44,7 @@
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Enable interactive mode for better visualization\n",
"from IPython.display import display, clear_output\n",
"import time\n",
"\n",
"\n",
"def display_step(step: int, action, result):\n",
"\tclear_output(wait=True)\n",
"\tprint(f'Step {step}:\\n')\n",
"\tprint('Action:')\n",
"\tprint(action)\n",
"\tprint('\\nResult:')\n",
"\tprint(result)"
]
"source": []
},
{
"cell_type": "code",
@@ -85,13 +66,19 @@
}
],
"source": [
"# Run the agent step by step\n",
"\n",
"max_steps = 50\n",
"for i in range(max_steps):\n",
"\taction, result = await planning_service.step()\n",
"\tdisplay_step(i + 1, action, result)\n",
"\tprint(f'\\n📍 Step {i+1}')\n",
"\taction, result = await agent.step()\n",
"\n",
"\tprint('Action:', action)\n",
"\tprint('Result:', result)\n",
"\n",
"\tif result.done:\n",
"\t\tprint('\\n✅ Task completed successfully!')\n",
"\t\tprint('Extracted content:', result.extracted_content)\n",
"\t\tbreak\n"
]
},

View File

@@ -22,31 +22,38 @@ class AgentSystemPrompt:
AGENT_PROMPT = f"""
You are an AI agent that helps users interact with websites.
Your input are all the interactive elements of the current page from which you can choose which to click or input.
Your input are all the interactive elements with its context of the current page from.
This is how an input looks like:
1:Interactive element
3: <a href="https://www.ab.de/"></a>
9:<div>Interactive element</div>
33: <button>Clickable element</button>
_: Not clickable, only for context
Additional you get a list of previous actions and their results.
In the beginning the list will be empty.
On elements with _ you can not click.
Additional you get a list of your previous actions.
Available actions (choose EXACTLY ONE, not 0 or 2):
{self.default_action_description}
In the beginning the list will be empty so you have to do google search or go to url.
To interact with elements, use their index number in the click() or text_input() actions. Make sure the index exists in the list of interactive elements.
If you need more than the interactive elements from the page you can use the extract_content action.
At every step you HAVE to choose EXACTLY ONE action.
To interact with elements, use their index number in the click_element() or input_text() actions.
If you need more text from the page you can use the extract_page_content action.
Validate if the previous goal is achieved, if not, try to achieve it with the next action.
If you get stuck, try to find a new element that can help you achieve your goal or if persistent, go back or reload the page.
Respond with a valid JSON object containing the action, any required parameters and your current goal of this action.
You can send_user_text or ask_user for clarification if you are completely stuck.
Respond with a valid JSON object, containing the valuation_previous_goal, memory, next_goal and your next action to achieve the next goal.
valuation_previous_goal: valuation of the previous goal if it is achieved or what went wrong.
memory: This you can use as a memory to store where you are in your overall task. E.g. if you need to find 10 jobs, you can store the already found jobs here.
next_goal: Short description of the next goal you need to achieve.
If you get stuck and multiple time dont achieve the next_goal, try to find a new element that can help you achieve your task or if persistent, go back or reload the page and try a different approach.
You can ask_human for clarification if you are completely stuck or if you really need more information.
If a picture is provided, use it to understand the context and the next action.
If you are sure you are done you can extract_page_content to get the markdown content and in the next action call done() with the text of the requested result to end the task and wait for further instructions.
Make sure after filling a field if you need to click a suggestion or if the field is already filled.
"""
return SystemMessage(content=AGENT_PROMPT)
@@ -58,7 +65,6 @@ class AgentMessagePrompt:
def get_user_message(self) -> HumanMessage:
state_description = f"""
Current url: {self.state.url}
Interactive elements:
{self.state.dom_items_to_string()}
"""

View File

@@ -36,7 +36,7 @@ class AgentService:
).get_system_message()
print(system_prompt)
first_message = HumanMessage(content=f'Your task is: {task}')
first_message = HumanMessage(content=f'Your main task is: {task}')
# self.messages_all: list[BaseMessage] = []
self.messages: list[BaseMessage] = [system_prompt, first_message]

View File

@@ -12,8 +12,10 @@ class AskHumanAgentAction(BaseModel):
class AgentOnlyAction(BaseModel):
# TODO: this is not really an action with a function, but rather an output-only model
valuation_previous_goal: str
goal: str
memory: str
next_goal: str
ask_human: Optional[AskHumanAgentAction] = None
@@ -32,4 +34,4 @@ class AgentAction(ControllerActions, AgentOnlyAction):
if __name__ == '__main__':
print(AgentAction(valuation_previous_goal='Failed', goal='Click'))
print(AgentAction(valuation_previous_goal='Failed', next_goal='Click', memory=''))

View File

@@ -194,6 +194,13 @@ class BrowserService:
content = MainContentExtractor.extract(driver.page_source, output_format=value) # type: ignore TODO
return content
def done(self, text: str) -> str:
    """
    Report the final result of the task and hand it back to the caller.

    The method itself does not stop or wait for anything: it prints the URL
    of the current page together with ``text`` and returns ``text`` unchanged.

    Args:
        text: The result text supplied with the ``done`` action.

    Returns:
        The same ``text`` that was passed in.
    """
    # Keep the colon attached to the URL so the result starts on its own line
    # instead of being prefixed with a stray ": ".
    print(f'Done on page {self.current_state.url}:\n\n{text}')
    return text
def take_screenshot(self, full_page: bool = False) -> str:
"""
Returns a base64 encoded screenshot of the current page.

View File

@@ -53,7 +53,8 @@ class ControllerService:
elif action.go_back:
self.browser.go_back()
elif action.done:
return ControllerActionResult(done=True)
self.browser.done(action.done.text)
return ControllerActionResult(done=True, extracted_content=action.done.text)
elif action.click_element:
self.browser.click_element_by_index(
action.click_element.id, self.get_cached_browser_state()

View File

@@ -22,6 +22,10 @@ class InputTextControllerAction(BaseModel):
text: str
# Payload of the `done` controller action: carries the text handed over
# when the task is marked as complete.
class DoneControllerAction(BaseModel):
    # Result text supplied with the `done` action
    text: str
class ControllerActions(BaseModel):
"""
Controller actions you can use to interact.
@@ -31,7 +35,7 @@ class ControllerActions(BaseModel):
go_to_url: Optional[GoToUrlControllerAction] = None
nothing: Optional[Literal[True]] = None
go_back: Optional[Literal[True]] = None
done: Optional[Literal[True]] = None
done: Optional[DoneControllerAction] = None
click_element: Optional[ClickElementControllerAction] = None
input_text: Optional[InputTextControllerAction] = None
extract_page_content: Optional[Literal[True]] = None
@@ -51,7 +55,7 @@ class ControllerActions(BaseModel):
- Go back to previous page
Example: {"go_back": true}
- Mark entire task as complete
Example: {"done": true}
Example: {"done": {"text": "This is the requested result of the task..."}}
- Click an element by its ID
Example: {"click_element": {"id": 1}}
- Input text into an element by its ID

View File

@@ -79,6 +79,7 @@ async def test_kayak_flight_search():
# # check if output is exactly True (boolean)
if result.done:
print('\n✅ Task completed successfully')
print('Extracted content:', result.extracted_content)
break
# time.sleep(0.5)

View File

@@ -82,6 +82,7 @@ async def test_mind2web_samples():
if result.done:
print('\n✅ Sample completed successfully')
print('Extracted content:', result.extracted_content)
results['successful'] += 1
sample_success = True
break
@@ -167,6 +168,7 @@ async def test_single_mind2web_sample():
if result.done:
print('\n✅ Task completed successfully')
print('Extracted content:', result.extracted_content)
break
else:
print('\n❌ Failed to complete task in maximum steps')