Improved CSV file generation

This commit is contained in:
Saurav Panda
2026-02-25 11:45:05 -08:00
parent 943c93ab5d
commit 8270346ccf
3 changed files with 238 additions and 1 deletions

View File

@@ -1,5 +1,7 @@
import asyncio
import base64
import csv
import io
import os
import re
import shutil
@@ -164,12 +166,68 @@ class JsonFile(BaseFile):
class CsvFile(BaseFile):
	"""CSV file implementation with automatic RFC 4180 normalization.

	LLMs frequently produce malformed CSV (missing quotes around fields with commas,
	inconsistent empty fields, unescaped internal quotes). This class parses the raw
	content through Python's csv module on every write to guarantee well-formed output.
	"""

	@property
	def extension(self) -> str:
		return 'csv'

	@staticmethod
	def _normalize_csv(raw: str) -> str:
		"""Parse and re-serialize CSV content to fix quoting, empty fields, and escaping.

		Handles common LLM mistakes: unquoted fields containing commas,
		unescaped quotes inside fields, inconsistent empty fields,
		trailing/leading blank lines, and double-escaped JSON output
		(literal backslash-n and backslash-quote instead of real newlines/quotes).

		Args:
			raw: The CSV text exactly as produced by the model.

		Returns:
			Well-formed CSV text with a single '\\n' between rows and no trailing
			newline. Empty/whitespace-only input, or input the csv module cannot
			parse at all, is returned unchanged so a bad write never destroys data.
		"""
		stripped = raw.strip()
		if not stripped:
			return raw

		# Detect double-escaped LLM tool call output: if the content has no real
		# newlines but contains literal \n sequences, the entire string is likely
		# double-escaped JSON. Unescape \" → " first, then \n → newline.
		if '\n' not in stripped and '\\n' in stripped:
			stripped = stripped.replace('\\"', '"')
			stripped = stripped.replace('\\n', '\n')

		try:
			reader = csv.reader(io.StringIO(stripped))
			rows: list[list[str]] = []
			for row in reader:
				# Skip completely empty rows (artifacts of blank lines)
				if row:
					rows.append(row)
		except csv.Error:
			# Unparseable content (e.g. a field exceeding csv.field_size_limit):
			# keep the raw text rather than raising out of the write path.
			return raw

		if not rows:
			return raw

		out = io.StringIO()
		writer = csv.writer(out, lineterminator='\n')
		writer.writerows(rows)
		# Strip trailing newline so callers (write_file action) control line endings
		return out.getvalue().rstrip('\n')

	def write_file_content(self, content: str) -> None:
		"""Normalize CSV content before storing."""
		self.update_content(self._normalize_csv(content))

	def append_file_content(self, content: str) -> None:
		"""Normalize the appended CSV rows and merge with existing content."""
		normalized_new = self._normalize_csv(content)
		if not normalized_new.strip():
			# Nothing meaningful to append (blank/whitespace-only payload).
			return
		existing = self.content
		if existing and not existing.endswith('\n'):
			existing += '\n'
		combined = existing + normalized_new
		# Re-normalize the merged text so existing rows and new rows stay consistent.
		self.update_content(self._normalize_csv(combined))
class JsonlFile(BaseFile):
"""JSONL (JSON Lines) file implementation"""

View File

@@ -0,0 +1,51 @@
"""
Generate CSV files with automatic normalization.
The agent's file system automatically normalizes CSV output using Python's csv module,
so fields containing commas, quotes, or empty values are properly handled per RFC 4180.
This means the agent doesn't need to worry about manual quoting — it's fixed at the
infrastructure level.
Common LLM mistakes that are auto-corrected:
- Unquoted fields containing commas (e.g. "San Francisco, CA" without quotes)
- Unescaped double quotes inside fields
- Inconsistent empty field handling
- Stray blank lines
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from browser_use import Agent, ChatBrowserUse
async def main():
	"""Run an agent that extracts city data into a CSV file, then print the normalized result."""
	agent = Agent(
		task=(
			'Go to https://en.wikipedia.org/wiki/List_of_largest_cities and extract the top 10 cities. '
			'Create a CSV file called "top_cities.csv" with columns: rank, city name, country, population. '
			'Make sure to include all cities even if some data is missing — leave those cells empty.'
		),
		llm=ChatBrowserUse(model='bu-2-0'),
	)
	# The run history is not needed here; we only inspect the file system afterwards.
	await agent.run()

	# Check the generated CSV file
	if agent.file_system:
		csv_file = agent.file_system.get_file('top_cities.csv')
		if csv_file:
			print('\nGenerated CSV content:')
			print(csv_file.content)
			print(f'\nFile saved to: {agent.file_system.get_dir() / csv_file.full_name}')


if __name__ == '__main__':
	asyncio.run(main())

View File

@@ -1286,3 +1286,131 @@ class TestFileSystemIntegration:
assert file_obj.content == f'Content for file {i}'
fs.nuke()
class TestCsvNormalization:
	"""Tests for the CSV normalization layer that repairs typical LLM output defects."""

	def test_normalize_quotes_fields_with_commas(self):
		"""Fields containing commas must survive a write round-trip with quoting intact."""
		f = CsvFile(name='test')
		payload = 'name,city\n"Smith, John","San Francisco, CA"'
		f.write_file_content(payload)
		assert f.content == payload

	def test_normalize_escapes_internal_quotes(self):
		"""Doubled quotes inside a field (RFC 4180 escaping) are preserved."""
		f = CsvFile(name='test')
		payload = 'name,quote\nJohn,"He said ""hello"""'
		f.write_file_content(payload)
		assert f.content == payload

	def test_normalize_handles_empty_fields(self):
		"""Consecutive commas denote empty cells and must not be collapsed."""
		f = CsvFile(name='test')
		payload = 'a,b,c\n1,,3\n,,\n4,5,'
		f.write_file_content(payload)
		assert f.content == payload

	def test_normalize_strips_blank_lines(self):
		"""Blank lines before and after the data are removed."""
		f = CsvFile(name='test')
		f.write_file_content('\n\na,b\n1,2\n\n')
		assert f.content == 'a,b\n1,2'

	def test_normalize_preserves_valid_csv(self):
		"""Well-formed input comes back byte-identical."""
		wellformed = 'name,age,city\nJohn,30,New York\nJane,25,London'
		f = CsvFile(name='test')
		f.write_file_content(wellformed)
		assert f.content == wellformed

	def test_normalize_empty_content(self):
		"""Empty and whitespace-only payloads are stored untouched."""
		f = CsvFile(name='test')
		for passthrough in ('', ' \n '):
			f.write_file_content(passthrough)
			assert f.content == passthrough

	def test_normalize_on_append(self):
		"""Appended rows merge with the existing content as normalized CSV."""
		f = CsvFile(name='test')
		f.write_file_content('name,city\nJohn,Boston')
		f.append_file_content('\n"Jane","New York, NY"')
		assert f.content == 'name,city\nJohn,Boston\nJane,"New York, NY"'

	def test_normalize_append_with_leading_newlines(self):
		"""Newlines prefixed to appended content do not create blank rows."""
		f = CsvFile(name='test')
		f.write_file_content('a,b\n1,2')
		f.append_file_content('\n\n3,4')
		assert f.content == 'a,b\n1,2\n3,4'

	async def test_normalize_through_filesystem_write(self):
		"""Normalization is applied when writing via FileSystem.write_file."""
		with tempfile.TemporaryDirectory() as tmp_dir:
			fs = FileSystem(base_dir=tmp_dir, create_default_files=False)
			expected = 'name,address\nJohn,"123 Main St, Apt 4"'
			await fs.write_file('data.csv', expected)
			stored = fs.get_file('data.csv')
			assert isinstance(stored, CsvFile)
			assert stored.content == expected
			# The bytes on disk must match the in-memory view.
			assert (fs.data_dir / 'data.csv').read_text() == expected
			fs.nuke()

	async def test_normalize_through_filesystem_append(self):
		"""Normalization is applied when appending via FileSystem.append_file."""
		with tempfile.TemporaryDirectory() as tmp_dir:
			fs = FileSystem(base_dir=tmp_dir, create_default_files=False)
			await fs.write_file('data.csv', 'name,score\nAlice,95')
			await fs.append_file('data.csv', '\n"Bob, Jr.",88')
			stored = fs.get_file('data.csv')
			assert stored is not None
			assert stored.content == 'name,score\nAlice,95\n"Bob, Jr.",88'
			fs.nuke()

	def test_normalize_single_column(self):
		"""A one-column file (no commas at all) round-trips cleanly."""
		f = CsvFile(name='test')
		payload = 'names\nAlice\nBob\nCharlie'
		f.write_file_content(payload)
		assert f.content == payload

	def test_normalize_quoted_newlines_in_fields(self):
		"""A quoted field may legitimately contain a newline; it must survive."""
		f = CsvFile(name='test')
		f.write_file_content('name,bio\nJohn,"Line 1\nLine 2"')
		assert f.content == 'name,bio\nJohn,"Line 1\nLine 2"'
		assert 'Line 1\nLine 2' in f.content

	def test_normalize_double_escaped_newlines(self):
		"""Literal backslash-n sequences (double-escaped tool output) become real row breaks."""
		f = CsvFile(name='test')
		# Simulate double-escaped content: literal \n as it arrives from the LLM
		f.write_file_content('name,city\\n1,Jakarta\\n2,Dhaka')
		assert f.content == 'name,city\n1,Jakarta\n2,Dhaka'

	def test_normalize_double_escaped_quotes_and_newlines(self):
		"""Regression for the original bug: literal \\n row breaks plus \\" quoting."""
		f = CsvFile(name='test')
		# Exactly what the LLM sends: literal \n between rows and literal \"
		# wrapping fields that contain commas
		raw = 'rank,city,country,population\\n1,Jakarta,Indonesia,\\"41,913,860\\"\\n2,Dhaka,Bangladesh,\\"36,585,479\\"'
		f.write_file_content(raw)
		# After unescaping, each row must be proper CSV with quoted numerics
		assert f.content.split('\n') == [
			'rank,city,country,population',
			'1,Jakarta,Indonesia,"41,913,860"',
			'2,Dhaka,Bangladesh,"36,585,479"',
		]

	def test_normalize_does_not_unescape_when_real_newlines_exist(self):
		"""Real newlines present → literal \\n inside a field is left alone."""
		f = CsvFile(name='test')
		# Content mixing real newlines with a field that genuinely contains \n chars
		f.write_file_content('path,desc\n/tmp/a\\nb,test file')
		assert f.content == 'path,desc\n/tmp/a\\nb,test file'