Improved CSV file generation

This commit is contained in:
Saurav Panda
2026-02-25 11:45:05 -08:00
parent 943c93ab5d
commit 8270346ccf
3 changed files with 238 additions and 1 deletions

View File

@@ -1,5 +1,7 @@
import asyncio
import base64
import csv
import io
import os
import re
import shutil
@@ -164,12 +166,68 @@ class JsonFile(BaseFile):
class CsvFile(BaseFile):
	"""CSV file implementation with automatic RFC 4180 normalization.

	LLMs frequently produce malformed CSV (missing quotes around fields with commas,
	inconsistent empty fields, unescaped internal quotes). This class parses the raw
	content through Python's csv module on every write to guarantee well-formed output.
	"""

	@property
	def extension(self) -> str:
		return 'csv'

	@staticmethod
	def _normalize_csv(raw: str) -> str:
		"""Parse and re-serialize CSV content to fix quoting, empty fields, and escaping.

		Handles common LLM mistakes: unquoted fields containing commas,
		unescaped quotes inside fields, inconsistent empty fields,
		trailing/leading blank lines, and double-escaped JSON output
		(literal backslash-n and backslash-quote instead of real newlines/quotes).

		Args:
			raw: The CSV text exactly as produced by the model.

		Returns:
			Well-formed CSV text with a single '\\n' between rows and no trailing
			newline. Empty/whitespace-only input, or input the csv module cannot
			parse at all, is returned unchanged so a bad write never destroys data.
		"""
		stripped = raw.strip()
		if not stripped:
			return raw

		# Detect double-escaped LLM tool call output: if the content has no real
		# newlines but contains literal \n sequences, the entire string is likely
		# double-escaped JSON. Unescape \" → " first, then \n → newline.
		if '\n' not in stripped and '\\n' in stripped:
			stripped = stripped.replace('\\"', '"')
			stripped = stripped.replace('\\n', '\n')

		try:
			reader = csv.reader(io.StringIO(stripped))
			rows: list[list[str]] = []
			for row in reader:
				# Skip completely empty rows (artifacts of blank lines)
				if row:
					rows.append(row)
		except csv.Error:
			# Unparseable content (e.g. a field exceeding csv.field_size_limit):
			# keep the raw text rather than raising out of the write path.
			return raw

		if not rows:
			return raw

		out = io.StringIO()
		writer = csv.writer(out, lineterminator='\n')
		writer.writerows(rows)
		# Strip trailing newline so callers (write_file action) control line endings
		return out.getvalue().rstrip('\n')

	def write_file_content(self, content: str) -> None:
		"""Normalize CSV content before storing."""
		self.update_content(self._normalize_csv(content))

	def append_file_content(self, content: str) -> None:
		"""Normalize the appended CSV rows and merge with existing content."""
		normalized_new = self._normalize_csv(content)
		if not normalized_new.strip():
			# Nothing meaningful to append (blank/whitespace-only payload).
			return
		existing = self.content
		if existing and not existing.endswith('\n'):
			existing += '\n'
		combined = existing + normalized_new
		# Re-normalize the merged text so existing rows and new rows stay consistent.
		self.update_content(self._normalize_csv(combined))
class JsonlFile(BaseFile):
"""JSONL (JSON Lines) file implementation"""

View File

@@ -0,0 +1,51 @@
"""
Generate CSV files with automatic normalization.
The agent's file system automatically normalizes CSV output using Python's csv module,
so fields containing commas, quotes, or empty values are properly handled per RFC 4180.
This means the agent doesn't need to worry about manual quoting — it's fixed at the
infrastructure level.
Common LLM mistakes that are auto-corrected:
- Unquoted fields containing commas (e.g. "San Francisco, CA" without quotes)
- Unescaped double quotes inside fields
- Inconsistent empty field handling
- Stray blank lines
"""
import asyncio
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from browser_use import Agent, ChatBrowserUse
async def main():
	"""Run an agent that extracts city data into a CSV file, then print the normalized result."""
	agent = Agent(
		task=(
			'Go to https://en.wikipedia.org/wiki/List_of_largest_cities and extract the top 10 cities. '
			'Create a CSV file called "top_cities.csv" with columns: rank, city name, country, population. '
			'Make sure to include all cities even if some data is missing — leave those cells empty.'
		),
		llm=ChatBrowserUse(model='bu-2-0'),
	)
	# The run history is not needed here; we only inspect the file system afterwards.
	await agent.run()

	# Check the generated CSV file
	if agent.file_system:
		csv_file = agent.file_system.get_file('top_cities.csv')
		if csv_file:
			print('\nGenerated CSV content:')
			print(csv_file.content)
			print(f'\nFile saved to: {agent.file_system.get_dir() / csv_file.full_name}')


if __name__ == '__main__':
	asyncio.run(main())

View File

@@ -1286,3 +1286,131 @@ class TestFileSystemIntegration:
assert file_obj.content == f'Content for file {i}'
fs.nuke()
class TestCsvNormalization:
	"""Tests for the CSV normalization layer that repairs typical LLM output defects."""

	def test_normalize_quotes_fields_with_commas(self):
		"""Fields containing commas must survive a write round-trip with quoting intact."""
		f = CsvFile(name='test')
		payload = 'name,city\n"Smith, John","San Francisco, CA"'
		f.write_file_content(payload)
		assert f.content == payload

	def test_normalize_escapes_internal_quotes(self):
		"""Doubled quotes inside a field (RFC 4180 escaping) are preserved."""
		f = CsvFile(name='test')
		payload = 'name,quote\nJohn,"He said ""hello"""'
		f.write_file_content(payload)
		assert f.content == payload

	def test_normalize_handles_empty_fields(self):
		"""Consecutive commas denote empty cells and must not be collapsed."""
		f = CsvFile(name='test')
		payload = 'a,b,c\n1,,3\n,,\n4,5,'
		f.write_file_content(payload)
		assert f.content == payload

	def test_normalize_strips_blank_lines(self):
		"""Blank lines before and after the data are removed."""
		f = CsvFile(name='test')
		f.write_file_content('\n\na,b\n1,2\n\n')
		assert f.content == 'a,b\n1,2'

	def test_normalize_preserves_valid_csv(self):
		"""Well-formed input comes back byte-identical."""
		wellformed = 'name,age,city\nJohn,30,New York\nJane,25,London'
		f = CsvFile(name='test')
		f.write_file_content(wellformed)
		assert f.content == wellformed

	def test_normalize_empty_content(self):
		"""Empty and whitespace-only payloads are stored untouched."""
		f = CsvFile(name='test')
		for passthrough in ('', ' \n '):
			f.write_file_content(passthrough)
			assert f.content == passthrough

	def test_normalize_on_append(self):
		"""Appended rows merge with the existing content as normalized CSV."""
		f = CsvFile(name='test')
		f.write_file_content('name,city\nJohn,Boston')
		f.append_file_content('\n"Jane","New York, NY"')
		assert f.content == 'name,city\nJohn,Boston\nJane,"New York, NY"'

	def test_normalize_append_with_leading_newlines(self):
		"""Newlines prefixed to appended content do not create blank rows."""
		f = CsvFile(name='test')
		f.write_file_content('a,b\n1,2')
		f.append_file_content('\n\n3,4')
		assert f.content == 'a,b\n1,2\n3,4'

	async def test_normalize_through_filesystem_write(self):
		"""Normalization is applied when writing via FileSystem.write_file."""
		with tempfile.TemporaryDirectory() as tmp_dir:
			fs = FileSystem(base_dir=tmp_dir, create_default_files=False)
			expected = 'name,address\nJohn,"123 Main St, Apt 4"'
			await fs.write_file('data.csv', expected)
			stored = fs.get_file('data.csv')
			assert isinstance(stored, CsvFile)
			assert stored.content == expected
			# The bytes on disk must match the in-memory view.
			assert (fs.data_dir / 'data.csv').read_text() == expected
			fs.nuke()

	async def test_normalize_through_filesystem_append(self):
		"""Normalization is applied when appending via FileSystem.append_file."""
		with tempfile.TemporaryDirectory() as tmp_dir:
			fs = FileSystem(base_dir=tmp_dir, create_default_files=False)
			await fs.write_file('data.csv', 'name,score\nAlice,95')
			await fs.append_file('data.csv', '\n"Bob, Jr.",88')
			stored = fs.get_file('data.csv')
			assert stored is not None
			assert stored.content == 'name,score\nAlice,95\n"Bob, Jr.",88'
			fs.nuke()

	def test_normalize_single_column(self):
		"""A one-column file (no commas at all) round-trips cleanly."""
		f = CsvFile(name='test')
		payload = 'names\nAlice\nBob\nCharlie'
		f.write_file_content(payload)
		assert f.content == payload

	def test_normalize_quoted_newlines_in_fields(self):
		"""A quoted field may legitimately contain a newline; it must survive."""
		f = CsvFile(name='test')
		f.write_file_content('name,bio\nJohn,"Line 1\nLine 2"')
		assert f.content == 'name,bio\nJohn,"Line 1\nLine 2"'
		assert 'Line 1\nLine 2' in f.content

	def test_normalize_double_escaped_newlines(self):
		"""Literal backslash-n sequences (double-escaped tool output) become real row breaks."""
		f = CsvFile(name='test')
		# Simulate double-escaped content: literal \n as it arrives from the LLM
		f.write_file_content('name,city\\n1,Jakarta\\n2,Dhaka')
		assert f.content == 'name,city\n1,Jakarta\n2,Dhaka'

	def test_normalize_double_escaped_quotes_and_newlines(self):
		"""Regression for the original bug: literal \\n row breaks plus \\" quoting."""
		f = CsvFile(name='test')
		# Exactly what the LLM sends: literal \n between rows and literal \"
		# wrapping fields that contain commas
		raw = 'rank,city,country,population\\n1,Jakarta,Indonesia,\\"41,913,860\\"\\n2,Dhaka,Bangladesh,\\"36,585,479\\"'
		f.write_file_content(raw)
		# After unescaping, each row must be proper CSV with quoted numerics
		assert f.content.split('\n') == [
			'rank,city,country,population',
			'1,Jakarta,Indonesia,"41,913,860"',
			'2,Dhaka,Bangladesh,"36,585,479"',
		]

	def test_normalize_does_not_unescape_when_real_newlines_exist(self):
		"""Real newlines present → literal \\n inside a field is left alone."""
		f = CsvFile(name='test')
		# Content mixing real newlines with a field that genuinely contains \n chars
		f.write_file_content('path,desc\n/tmp/a\\nb,test file')
		assert f.content == 'path,desc\n/tmp/a\\nb,test file'