Revert "Rename extract_structured_data to get_markdown"

This reverts commit 3b46999b95.
This commit is contained in:
Magnus Müller
2025-08-31 13:33:53 -07:00
parent 3b46999b95
commit 21b1295793
10 changed files with 47 additions and 51 deletions

View File

@@ -139,18 +139,18 @@ class AgentMessagePrompt:
if self.browser_state.page_info:
pi = self.browser_state.page_info
pages_above = pi.pixels_above / pi.viewport_height if pi.viewport_height > 0 else 0
elements_text = f'... {self.browser_state.pixels_above} pixels above ({pages_above:.1f} pages) - scroll to see more or extract markdown data if you are looking for specific information ...\n{elements_text}'
elements_text = f'... {self.browser_state.pixels_above} pixels above ({pages_above:.1f} pages) - scroll to see more or extract structured data if you are looking for specific information ...\n{elements_text}'
else:
elements_text = f'... {self.browser_state.pixels_above} pixels above - scroll to see more or extract markdown data if you are looking for specific information ...\n{elements_text}'
elements_text = f'... {self.browser_state.pixels_above} pixels above - scroll to see more or extract structured data if you are looking for specific information ...\n{elements_text}'
else:
elements_text = f'[Start of page]\n{elements_text}'
if has_content_below:
if self.browser_state.page_info:
pi = self.browser_state.page_info
pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
elements_text = f'{elements_text}\n... {self.browser_state.pixels_below} pixels below ({pages_below:.1f} pages) - scroll to see more or extract markdown data if you are looking for specific information ...'
elements_text = f'{elements_text}\n... {self.browser_state.pixels_below} pixels below ({pages_below:.1f} pages) - scroll to see more or extract structured data if you are looking for specific information ...'
else:
elements_text = f'{elements_text}\n... {self.browser_state.pixels_below} pixels below - scroll to see more or extract markdown data if you are looking for specific information ...'
elements_text = f'{elements_text}\n... {self.browser_state.pixels_below} pixels below - scroll to see more or extract structured data if you are looking for specific information ...'
else:
elements_text = f'{elements_text}\n[End of page]'
else:
@@ -176,7 +176,7 @@ class AgentMessagePrompt:
# Check if current page is a PDF viewer and add appropriate message
pdf_message = ''
if self.browser_state.is_pdf_viewer:
pdf_message = 'PDF viewer cannot be rendered. In this page, DO NOT use the get_markdown action as PDF content cannot be rendered. Use the read_file action on the downloaded PDF in available_file_paths to read the full content.\n\n'
pdf_message = 'PDF viewer cannot be rendered. In this page, DO NOT use the extract_structured_data action as PDF content cannot be rendered. Use the read_file action on the downloaded PDF in available_file_paths to read the full content.\n\n'
# Add recent events if available and requested
recent_events_text = ''

View File

@@ -21,7 +21,7 @@ At every step, your input will consist of:
2. <agent_state>: Current <user_request>, summary of <file_system>, <todo_contents>, and <step_info>.
3. <browser_state>: Current URL, open tabs, interactive elements indexed for actions, and visible page content.
4. <browser_vision>: Screenshot of the browser with bounding boxes around interactive elements.
5. <read_state> This will be displayed only if your previous action was get_markdown or read_file. This data is only shown in the current step.
5. <read_state> This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step.
</input>
<agent_history>
@@ -81,9 +81,9 @@ Strictly follow these rules while using the browser and navigating the web:
- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack).
- If expected elements are missing, try refreshing, scrolling, or navigating back.
- If the page is not fully loaded, use the wait action.
- You can call get_markdown on specific pages to gather structured semantic information from the entire page, including parts not currently visible. WARNING: This calls an LLM and is expensive.
- Call get_markdown only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
- Calling the get_markdown tool is EXPENSIVE as it uses a LLM! DO NOT query the same page with the same get_markdown query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- You can call extract_structured_data on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract_structured_data only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
- Calling the extract_structured_data tool is expensive! DO NOT query the same page with the same extract_structured_data query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step.
- If the <user_request> includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient.
@@ -140,7 +140,7 @@ You can output multiple actions in one step. Try to be efficient where it makes
- `input_text` + `click_element_by_index` → Fill form field and submit/search in one step
- `input_text` + `input_text` → Fill multiple form fields
- `click_element_by_index` + `click_element_by_index` → Navigate through multi-step flows (when the page does not navigate between clicks)
- `scroll` with num_pages 10 + `get_markdown` → Scroll to the bottom of the page to load more content before extracting structured data
- `scroll` with num_pages 10 + `extract_structured_data` → Scroll to the bottom of the page to load more content before extracting structured data
- File operations + browser actions
Do not try multiple different paths in one step. Always have one clear goal per step.

View File

@@ -21,7 +21,7 @@ At every step, your input will consist of:
2. <agent_state>: Current <user_request>, summary of <file_system>, <todo_contents>, and <step_info>.
3. <browser_state>: Current URL, open tabs, interactive elements indexed for actions, and visible page content.
4. <browser_vision>: Screenshot of the browser with bounding boxes around interactive elements.
5. <read_state> This will be displayed only if your previous action was get_markdown or read_file. This data is only shown in the current step.
5. <read_state> This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step.
</input>
<agent_history>
@@ -79,9 +79,9 @@ Strictly follow these rules while using the browser and navigating the web:
- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack).
- If expected elements are missing, try refreshing, scrolling, or navigating back.
- If the page is not fully loaded, use the wait action.
- You can call get_markdown on specific pages to gather structured semantic information from the entire page, including parts not currently visible. WARNING: This calls an LLM and is expensive.
- Call get_markdown only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
- Calling the get_markdown tool is EXPENSIVE as it uses a LLM! DO NOT query the same page with the same get_markdown query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- You can call extract_structured_data on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract_structured_data only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
- Calling the extract_structured_data tool is expensive! DO NOT query the same page with the same extract_structured_data query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step.
- If the <user_request> includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient.
@@ -137,7 +137,7 @@ You can output multiple actions in one step. Try to be efficient where it makes
- `input_text` + `click_element_by_index` → Fill form field and submit/search in one step
- `input_text` + `input_text` → Fill multiple form fields
- `click_element_by_index` + `click_element_by_index` → Navigate through multi-step flows (when the page does not navigate between clicks)
- `scroll` with num_pages 10 + `get_markdown` → Scroll to the bottom of the page to load more content before extracting structured data
- `scroll` with num_pages 10 + `extract_structured_data` → Scroll to the bottom of the page to load more content before extracting structured data
- File operations + browser actions
Do not try multiple different paths in one step. Always have one clear goal per step.

View File

@@ -21,7 +21,7 @@ At every step, your input will consist of:
2. <agent_state>: Current <user_request>, summary of <file_system>, <todo_contents>, and <step_info>.
3. <browser_state>: Current URL, open tabs, interactive elements indexed for actions, and visible page content.
4. <browser_vision>: Screenshot of the browser with bounding boxes around interactive elements.
5. <read_state> This will be displayed only if your previous action was get_markdown or read_file. This data is only shown in the current step.
5. <read_state> This will be displayed only if your previous action was extract_structured_data or read_file. This data is only shown in the current step.
</input>
<agent_history>
@@ -81,9 +81,9 @@ Strictly follow these rules while using the browser and navigating the web:
- If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack).
- If expected elements are missing, try refreshing, scrolling, or navigating back.
- If the page is not fully loaded, use the wait action.
- You can call get_markdown on specific pages to gather structured semantic information from the entire page, including parts not currently visible. WARNING: This calls an LLM and is expensive.
- Call get_markdown only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
- Calling the get_markdown tool is EXPENSIVE as it uses a LLM! DO NOT query the same page with the same get_markdown query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- You can call extract_structured_data on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
- Call extract_structured_data only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
- Calling the extract_structured_data tool is expensive! DO NOT query the same page with the same extract_structured_data query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step.
- If the <user_request> includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient.
@@ -139,7 +139,7 @@ You can output multiple actions in one step. Try to be efficient where it makes
- `input_text` + `click_element_by_index` → Fill form field and submit/search in one step
- `input_text` + `input_text` → Fill multiple form fields
- `click_element_by_index` + `click_element_by_index` → Navigate through multi-step flows (when the page does not navigate between clicks)
- `scroll` with num_pages 10 + `get_markdown` → Scroll to the bottom of the page to load more content before extracting structured data
- `scroll` with num_pages 10 + `extract_structured_data` → Scroll to the bottom of the page to load more content before extracting structured data
- File operations + browser actions
Do not try multiple different paths in one step. Always have one clear goal per step.

View File

@@ -266,7 +266,7 @@ class BrowserUseServer:
),
types.Tool(
name='browser_extract_content',
description='Extract markdown content from the current page based on a query',
description='Extract structured content from the current page based on a query',
inputSchema={
'type': 'object',
'properties': {
@@ -714,7 +714,7 @@ class BrowserUseServer:
state = await self.browser_session.get_browser_state_summary()
# Use the get_markdown action
# Use the extract_structured_data action
# Create a dynamic action model that matches the tool's expectations
from pydantic import create_model
@@ -722,7 +722,7 @@ class BrowserUseServer:
ExtractAction = create_model(
'ExtractAction',
__base__=ActionModel,
get_markdown=(dict[str, Any], {'query': query, 'extract_links': extract_links}),
extract_structured_data=(dict[str, Any], {'query': query, 'extract_links': extract_links}),
)
action = ExtractAction()

View File

@@ -536,22 +536,19 @@ class Tools(Generic[Context]):
# This action is temporarily disabled as it needs refactoring to use events
@self.registry.action(
"""Get markdown content from the current webpage and extract data from page markdown using an LLM. This is expensive - do not use it repeatedly for the same page.
"""Extract structured, semantic data (e.g. product description, price, all information about XYZ) from the markdown of the current webpage based on a query.
Recommended to be used ONLY when:
- You are sure that you are on the right page for the query
- You know exactly the information you need to extract from the page
- You need semantic analysis of page content (e.g. product description, price, all information about XYZ)
DO NOT call this tool to:
- Get interactive elements like buttons, links, dropdowns, menus, etc.
- If you previously called get_markdown on the same page with the same query, you should not call it again.
- If you previously asked extract_structured_data on the same page with the same query, you should not call it again.
WARNING: This calls an LLM and is expensive. Use sparingly.
Set extract_links=True only if your query requires extracting links/URLs from the page.
Set extract_links=True ONLY if your query requires extracting links/URLs from the page.
Use start_from_char to start extraction from a specific character position (use if extraction was previously truncated and you want more content).
""",
)
async def get_markdown(
async def extract_structured_data(
query: str,
extract_links: bool,
browser_session: BrowserSession,
@@ -605,7 +602,7 @@ Use start_from_char to start extraction from a specific character position (use
original_html_length = content_stats['original_html_chars']
initial_markdown_length = content_stats['initial_markdown_chars']
chars_filtered = content_stats['filtered_chars_removed']
stats_summary = f"""Content processed: {original_html_length:,} HTML chars → {initial_markdown_length:,} initial markdown → {final_filtered_length:,} filtered markdown"""
if start_from_char > 0:
stats_summary += f' (started from char {start_from_char:,})'
@@ -893,37 +890,36 @@ You will be given a query and the markdown of a webpage that has been filtered t
)
# Custom done action for structured output
async def extract_clean_markdown(
self, browser_session: BrowserSession, extract_links: bool = False
) -> tuple[str, dict[str, Any]]:
async def extract_clean_markdown(self, browser_session: BrowserSession, extract_links: bool = False) -> tuple[str, dict[str, Any]]:
"""Extract clean markdown from the current page.
Args:
browser_session: Browser session to extract content from
extract_links: Whether to preserve links in markdown
Returns:
tuple: (clean_markdown_content, content_statistics)
"""
import re
# Get HTML content from current page
cdp_session = await browser_session.get_or_create_cdp_session()
try:
body_id = await cdp_session.cdp_client.send.DOM.getDocument(session_id=cdp_session.session_id)
page_html_result = await cdp_session.cdp_client.send.DOM.getOuterHTML(
params={'backendNodeId': body_id['root']['backendNodeId']}, session_id=cdp_session.session_id
params={'backendNodeId': body_id['root']['backendNodeId']},
session_id=cdp_session.session_id
)
page_html = page_html_result['outerHTML']
current_url = await browser_session.get_current_page_url()
except Exception as e:
raise RuntimeError(f"Couldn't extract page content: {e}")
original_html_length = len(page_html)
# Use html2text for clean markdown conversion
import html2text
h = html2text.HTML2Text()
h.ignore_links = not extract_links
h.ignore_images = True
@@ -932,17 +928,17 @@ You will be given a query and the markdown of a webpage that has been filtered t
h.unicode_snob = True
h.skip_internal_links = True
content = h.handle(page_html)
initial_markdown_length = len(content)
# Minimal cleanup - html2text already does most of the work
content = re.sub(r'%[0-9A-Fa-f]{2}', '', content) # Remove any remaining URL encoding
# Apply light preprocessing to clean up excessive whitespace
content, chars_filtered = self._preprocess_markdown_content(content)
final_filtered_length = len(content)
# Content statistics
stats = {
'url': current_url,
@@ -951,7 +947,7 @@ You will be given a query and the markdown of a webpage that has been filtered t
'filtered_chars_removed': chars_filtered,
'final_filtered_chars': final_filtered_length,
}
return content, stats
def _preprocess_markdown_content(self, content: str, max_newlines: int = 3) -> tuple[str, int]:

View File

@@ -214,7 +214,7 @@ The interactive elements include all clickable and interactive elements on the p
##### `browser_extract_content`
Extract markdown content from the current page using AI.
Extract structured content from the current page using AI.
```typescript
browser_extract_content(query: string, extract_links?: boolean): string

View File

@@ -27,7 +27,7 @@ mode: "wide"
- **`close_tab`** - Close browser tabs
### Content Extraction
- **`get_markdown`** - This extracts the page markdown and query it using an LLM (like a subagent)
- **`extract_structured_data`** - Extract data from webpages using LLM
### Form Controls
- **`get_dropdown_options`** - Get dropdown option values

View File

@@ -12,7 +12,7 @@ from browser_use import Agent, ChatOpenAI
# This uses a bigger model for the planning
# And a smaller model for the page content extraction
# Think of it like a subagent which only task is to extract content from the current page
# Think of it like a subagent which only task is to extract content from the current page
llm = ChatOpenAI(model='gpt-4.1')
small_llm = ChatOpenAI(model='gpt-4.1-mini')
task = 'Find the founders of browser-use in ycombinator, extract all links and open the links one by one'

View File

@@ -24,7 +24,7 @@ Then, use append_file to add the first sentence of the article to "data.md"
Then, read the file to see its content and make sure it's correct.
Finally, share the file with me.
NOTE: DO NOT USE get_markdown action - everything is visible in browser state.
NOTE: DO NOT USE extract_structured_data action - everything is visible in browser state.
""".strip('\n')
llm = ChatOpenAI(model='gpt-4.1-mini')