# Mirror of https://github.com/browser-use/browser-use (synced 2026-04-22)
import asyncio
import base64
import csv
import io
import os
import re
import shutil
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any

from pydantic import BaseModel, Field

UNSUPPORTED_BINARY_EXTENSIONS = {
	'png',
	'jpg',
	'jpeg',
	'gif',
	'bmp',
	'svg',
	'webp',
	'ico',
	'mp3',
	'mp4',
	'wav',
	'avi',
	'mov',
	'zip',
	'tar',
	'gz',
	'rar',
	'exe',
	'bin',
	'dll',
	'so',
}

def _build_filename_error_message(file_name: str, supported_extensions: list[str]) -> str:
	"""Build a specific error message explaining why the filename was rejected and how to fix it."""
	base = os.path.basename(file_name)

	# Check for binary/image extension
	if '.' in base:
		_, ext = base.rsplit('.', 1)
		ext_lower = ext.lower()
		if ext_lower in UNSUPPORTED_BINARY_EXTENSIONS:
			return (
				f"Error: Cannot write binary/image file '{base}'. "
				f'The write_file tool only supports text-based files. '
				f'Supported extensions: {", ".join("." + e for e in supported_extensions)}. '
				f'For screenshots, the browser automatically captures them - do not try to save screenshots as files.'
			)
		if ext_lower not in supported_extensions:
			return (
				f"Error: Unsupported file extension '.{ext_lower}' in '{base}'. "
				f'Supported extensions: {", ".join("." + e for e in supported_extensions)}. '
				f'Please rename the file to use a supported extension.'
			)

	# No extension or no dot
	if '.' not in base:
		return (
			f"Error: Filename '{base}' has no extension. "
			f'Please add a supported extension: {", ".join("." + e for e in supported_extensions)}.'
		)

	return (
		f"Error: Invalid filename '{base}'. "
		f'Filenames must contain only letters, numbers, underscores, hyphens, dots, parentheses, and spaces. '
		f'Supported extensions: {", ".join("." + e for e in supported_extensions)}.'
	)

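# Illustrative note (not part of the upstream module): the helper above picks one of four
# messages depending on what is wrong with the name. Traced examples (output truncated):
#     _build_filename_error_message('screenshot.png', ['md', 'txt'])
#         -> "Error: Cannot write binary/image file 'screenshot.png'. ..."
#     _build_filename_error_message('data.yaml', ['md', 'txt'])
#         -> "Error: Unsupported file extension '.yaml' in 'data.yaml'. ..."
#     _build_filename_error_message('notes', ['md', 'txt'])
#         -> "Error: Filename 'notes' has no extension. ..."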
DEFAULT_FILE_SYSTEM_PATH = 'browseruse_agent_data'


class FileSystemError(Exception):
	"""Custom exception for file system operations that should be shown to LLM"""

	pass

class BaseFile(BaseModel, ABC):
	"""Base class for all file types"""

	name: str
	content: str = ''

	# --- Subclass must define this ---
	@property
	@abstractmethod
	def extension(self) -> str:
		"""File extension (e.g. 'txt', 'md')"""
		pass

	def write_file_content(self, content: str) -> None:
		"""Update internal content (formatted)"""
		self.update_content(content)

	def append_file_content(self, content: str) -> None:
		"""Append content to internal content"""
		self.update_content(self.content + content)

	# --- These are shared and implemented here ---

	def update_content(self, content: str) -> None:
		self.content = content

	def sync_to_disk_sync(self, path: Path) -> None:
		file_path = path / self.full_name
		file_path.write_text(self.content)

	async def sync_to_disk(self, path: Path) -> None:
		file_path = path / self.full_name
		with ThreadPoolExecutor() as executor:
			await asyncio.get_event_loop().run_in_executor(executor, lambda: file_path.write_text(self.content))

	async def write(self, content: str, path: Path) -> None:
		self.write_file_content(content)
		await self.sync_to_disk(path)

	async def append(self, content: str, path: Path) -> None:
		self.append_file_content(content)
		await self.sync_to_disk(path)

	def read(self) -> str:
		return self.content

	@property
	def full_name(self) -> str:
		return f'{self.name}.{self.extension}'

	@property
	def get_size(self) -> int:
		return len(self.content)

	@property
	def get_line_count(self) -> int:
		return len(self.content.splitlines())

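# Illustrative note (not part of the upstream module): adding a new text-based file type only
# requires an `extension` property; in-memory storage, write/append, and disk syncing are
# inherited from BaseFile. A hypothetical subclass would look like:
#
#     class YamlFile(BaseFile):
#         @property
#         def extension(self) -> str:
#             return 'yaml'
#
# To be reachable through FileSystem it would also need an entry in FileSystem._file_types
# (e.g. 'yaml': YamlFile) further below.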
class MarkdownFile(BaseFile):
	"""Markdown file implementation"""

	@property
	def extension(self) -> str:
		return 'md'


class TxtFile(BaseFile):
	"""Plain text file implementation"""

	@property
	def extension(self) -> str:
		return 'txt'


class JsonFile(BaseFile):
	"""JSON file implementation"""

	@property
	def extension(self) -> str:
		return 'json'

class CsvFile(BaseFile):
	"""CSV file implementation with automatic RFC 4180 normalization.

	LLMs frequently produce malformed CSV (missing quotes around fields with commas,
	inconsistent empty fields, unescaped internal quotes). This class parses the raw
	content through Python's csv module on every write to guarantee well-formed output.
	"""

	@property
	def extension(self) -> str:
		return 'csv'

	@staticmethod
	def _normalize_csv(raw: str) -> str:
		"""Parse and re-serialize CSV content to fix quoting, empty fields, and escaping.

		Handles common LLM mistakes: unquoted fields containing commas,
		unescaped quotes inside fields, inconsistent empty fields,
		trailing/leading blank lines, and double-escaped JSON output
		(literal backslash-n and backslash-quote instead of real newlines/quotes).
		"""
		stripped = raw.strip('\n\r')
		if not stripped:
			return raw

		# Detect double-escaped LLM tool call output: if the content has no real
		# newlines but contains literal \n sequences, the entire string is likely
		# double-escaped JSON. Unescape \" → " first, then \n → newline.
		if '\n' not in stripped and '\\n' in stripped:
			stripped = stripped.replace('\\"', '"')
			stripped = stripped.replace('\\n', '\n')

		reader = csv.reader(io.StringIO(stripped))
		rows: list[list[str]] = []
		for row in reader:
			# Skip completely empty rows (artifacts of blank lines)
			if row:
				rows.append(row)

		if not rows:
			return raw

		out = io.StringIO()
		writer = csv.writer(out, lineterminator='\n')
		writer.writerows(rows)
		# Strip trailing newline so callers (write_file action) control line endings
		return out.getvalue().rstrip('\n')

	def write_file_content(self, content: str) -> None:
		"""Normalize CSV content before storing."""
		self.update_content(self._normalize_csv(content))

	def append_file_content(self, content: str) -> None:
		"""Normalize the appended CSV rows and merge with existing content."""
		normalized_new = self._normalize_csv(content)
		if not normalized_new.strip('\n\r'):
			return
		existing = self.content
		if existing and not existing.endswith('\n'):
			existing += '\n'
		combined = existing + normalized_new
		self.update_content(self._normalize_csv(combined))

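# Illustrative note (not part of the upstream module): traced examples of _normalize_csv.
# Blank lines are dropped and quoting is re-emitted per RFC 4180:
#     CsvFile._normalize_csv('a,b\n\n1,"x, y"\n')  ->  'a,b\n1,"x, y"'
# Double-escaped tool-call output (literal \n sequences, no real newlines) is unescaped first:
#     CsvFile._normalize_csv('a,b\\n1,2')          ->  'a,b\n1,2'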
class JsonlFile(BaseFile):
	"""JSONL (JSON Lines) file implementation"""

	@property
	def extension(self) -> str:
		return 'jsonl'

class PdfFile(BaseFile):
	"""PDF file implementation"""

	@property
	def extension(self) -> str:
		return 'pdf'

	def sync_to_disk_sync(self, path: Path) -> None:
		# Lazy import reportlab
		from reportlab.lib.pagesizes import letter
		from reportlab.lib.styles import getSampleStyleSheet
		from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer

		file_path = path / self.full_name
		try:
			# Create PDF document
			doc = SimpleDocTemplate(str(file_path), pagesize=letter)
			styles = getSampleStyleSheet()
			story = []

			# Convert markdown content to simple text and add to PDF
			# For basic implementation, we'll treat content as plain text
			# This avoids the AGPL license issue while maintaining functionality
			content_lines = self.content.split('\n')

			for line in content_lines:
				if line.strip():
					# Handle basic markdown headers
					if line.startswith('# '):
						para = Paragraph(line[2:], styles['Title'])
					elif line.startswith('## '):
						para = Paragraph(line[3:], styles['Heading1'])
					elif line.startswith('### '):
						para = Paragraph(line[4:], styles['Heading2'])
					else:
						para = Paragraph(line, styles['Normal'])
					story.append(para)
				else:
					story.append(Spacer(1, 6))

			doc.build(story)
		except Exception as e:
			raise FileSystemError(f"Error: Could not write to file '{self.full_name}'. {str(e)}")

	async def sync_to_disk(self, path: Path) -> None:
		with ThreadPoolExecutor() as executor:
			await asyncio.get_event_loop().run_in_executor(executor, lambda: self.sync_to_disk_sync(path))

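# Illustrative usage sketch (not part of the upstream module; requires reportlab installed):
#     await PdfFile(name='report').write('# Title\n\nSome body text', Path('/tmp'))
# renders /tmp/report.pdf, mapping '# ', '## ', and '### ' prefixes to the Title, Heading1,
# and Heading2 styles and everything else to Normal paragraphs.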
class DocxFile(BaseFile):
	"""DOCX file implementation"""

	@property
	def extension(self) -> str:
		return 'docx'

	def sync_to_disk_sync(self, path: Path) -> None:
		file_path = path / self.full_name
		try:
			from docx import Document

			doc = Document()

			# Convert content to DOCX paragraphs
			content_lines = self.content.split('\n')

			for line in content_lines:
				if line.strip():
					# Handle basic markdown headers
					if line.startswith('# '):
						doc.add_heading(line[2:], level=1)
					elif line.startswith('## '):
						doc.add_heading(line[3:], level=2)
					elif line.startswith('### '):
						doc.add_heading(line[4:], level=3)
					else:
						doc.add_paragraph(line)
				else:
					doc.add_paragraph()  # Empty paragraph for spacing

			doc.save(str(file_path))
		except Exception as e:
			raise FileSystemError(f"Error: Could not write to file '{self.full_name}'. {str(e)}")

	async def sync_to_disk(self, path: Path) -> None:
		with ThreadPoolExecutor() as executor:
			await asyncio.get_event_loop().run_in_executor(executor, lambda: self.sync_to_disk_sync(path))

class HtmlFile(BaseFile):
	"""HTML file implementation"""

	@property
	def extension(self) -> str:
		return 'html'


class XmlFile(BaseFile):
	"""XML file implementation"""

	@property
	def extension(self) -> str:
		return 'xml'

class FileSystemState(BaseModel):
	"""Serializable state of the file system"""

	files: dict[str, dict[str, Any]] = Field(default_factory=dict)  # full filename -> file data
	base_dir: str
	extracted_content_count: int = 0

class FileSystem:
	"""Enhanced file system with in-memory storage and multiple file type support"""

	def __init__(self, base_dir: str | Path, create_default_files: bool = True):
		# Handle the Path conversion before calling super().__init__
		self.base_dir = Path(base_dir) if isinstance(base_dir, str) else base_dir
		self.base_dir.mkdir(parents=True, exist_ok=True)

		# Create and use a dedicated subfolder for all operations
		self.data_dir = self.base_dir / DEFAULT_FILE_SYSTEM_PATH
		if self.data_dir.exists():
			# clean the data directory
			shutil.rmtree(self.data_dir)
		self.data_dir.mkdir(exist_ok=True)

		self._file_types: dict[str, type[BaseFile]] = {
			'md': MarkdownFile,
			'txt': TxtFile,
			'json': JsonFile,
			'jsonl': JsonlFile,
			'csv': CsvFile,
			'pdf': PdfFile,
			'docx': DocxFile,
			'html': HtmlFile,
			'xml': XmlFile,
		}

		self.files = {}
		if create_default_files:
			self.default_files = ['todo.md']
			self._create_default_files()

		self.extracted_content_count = 0

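	# Illustrative note (not part of the upstream module): FileSystem('/tmp/agent') creates
	# /tmp/agent/browseruse_agent_data/, wipes any previous contents of that subfolder, and
	# (by default) seeds it with an empty todo.md. All operations below stay inside it.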
	def get_allowed_extensions(self) -> list[str]:
		"""Get allowed extensions"""
		return list(self._file_types.keys())

	def _get_file_type_class(self, extension: str) -> type[BaseFile] | None:
		"""Get the appropriate file class for an extension."""
		return self._file_types.get(extension.lower(), None)

	def _create_default_files(self) -> None:
		"""Create default results and todo files"""
		for full_filename in self.default_files:
			name_without_ext, extension = self._parse_filename(full_filename)
			file_class = self._get_file_type_class(extension)
			if not file_class:
				raise ValueError(f"Error: Invalid file extension '{extension}' for file '{full_filename}'.")

			file_obj = file_class(name=name_without_ext)
			self.files[full_filename] = file_obj  # Use full filename as key
			file_obj.sync_to_disk_sync(self.data_dir)

	def _is_valid_filename(self, file_name: str) -> bool:
		"""Check if filename matches the required pattern: name.extension

		Allows letters, numbers, underscores, hyphens, dots, parentheses, spaces, and Chinese characters
		in the name part, followed by a dot and a supported extension.
		"""
		extensions = '|'.join(self._file_types.keys())
		# Allow dots, spaces, parens in the name part - match everything up to the last dot
		pattern = rf'^[a-zA-Z0-9_\-\.\(\) \u4e00-\u9fff]+\.({extensions})$'
		file_name_base = os.path.basename(file_name)
		if not re.match(pattern, file_name_base):
			return False
		# Ensure the name part (before last dot) is non-empty
		name_part = file_name_base.rsplit('.', 1)[0]
		return len(name_part.strip()) > 0

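	# Illustrative note (not part of the upstream module): examples against the pattern above,
	# assuming the default extension set:
	#     'notes (v2).md'  -> valid (spaces, parentheses, and digits are allowed in the name)
	#     '报告.md'         -> valid (the CJK range \u4e00-\u9fff is allowed)
	#     'data.yaml'      -> invalid (unsupported extension)
	#     'résumé.txt'     -> invalid here, but sanitize_filename() below recovers 'rsum.txt'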
	@staticmethod
	def sanitize_filename(file_name: str) -> str:
		"""Sanitize a filename by replacing/removing invalid characters.

		- Replaces spaces with hyphens
		- Removes characters that are not alphanumeric, underscore, hyphen, dot, parentheses, or Chinese
		- Preserves the extension
		- Collapses multiple consecutive hyphens
		"""
		base = os.path.basename(file_name)
		if '.' not in base:
			return base

		name_part, ext = base.rsplit('.', 1)
		# Replace spaces with hyphens
		name_part = name_part.replace(' ', '-')
		# Remove invalid characters (keep alphanumeric, underscore, hyphen, dot, parens, Chinese)
		name_part = re.sub(r'[^a-zA-Z0-9_\-\.\(\)\u4e00-\u9fff]', '', name_part)
		# Collapse multiple hyphens
		name_part = re.sub(r'-{2,}', '-', name_part)
		# Strip leading/trailing hyphens and dots
		name_part = name_part.strip('-.')

		if not name_part:
			name_part = 'file'

		return f'{name_part}.{ext.lower()}'

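	# Illustrative note (not part of the upstream module): traced examples of sanitize_filename:
	#     'my report (final).md' -> 'my-report-(final).md'   (spaces become hyphens)
	#     'weird**name!!.txt'    -> 'weirdname.txt'          (disallowed characters dropped)
	#     '../secret.md'         -> 'secret.md'              (basename strips traversal)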
	def _resolve_filename(self, file_name: str) -> tuple[str, bool]:
		"""Resolve a filename, attempting sanitization if the original is invalid.

		Normalizes to basename first to prevent directory traversal (e.g. ../secret.md).

		Returns:
			(resolved_name, was_changed): The resolved filename and whether it differs from the input.
			If resolution fails, returns (basename, was_changed).
		"""
		base_name = os.path.basename(file_name)
		was_changed = base_name != file_name

		if self._is_valid_filename(base_name):
			return base_name, was_changed

		sanitized = self.sanitize_filename(base_name)
		if sanitized != base_name and self._is_valid_filename(sanitized):
			return sanitized, True

		return base_name, was_changed

	def _parse_filename(self, filename: str) -> tuple[str, str]:
		"""Parse filename into name and extension. Always check _is_valid_filename first."""
		name, extension = filename.rsplit('.', 1)
		return name, extension.lower()

	def get_dir(self) -> Path:
		"""Get the file system directory"""
		return self.data_dir

	def get_file(self, full_filename: str) -> BaseFile | None:
		"""Get a file object by full filename, trying sanitization if the name is invalid."""
		resolved, _ = self._resolve_filename(full_filename)
		if not self._is_valid_filename(resolved):
			return None

		# Use resolved filename as key
		return self.files.get(resolved)

	def list_files(self) -> list[str]:
		"""List all files in the system"""
		return [file_obj.full_name for file_obj in self.files.values()]

	def display_file(self, full_filename: str) -> str | None:
		"""Display file content using file-specific display method"""
		resolved, _ = self._resolve_filename(full_filename)
		if not self._is_valid_filename(resolved):
			return None

		file_obj = self.files.get(resolved)
		if not file_obj:
			return None

		return file_obj.read()

	async def read_file_structured(self, full_filename: str, external_file: bool = False) -> dict[str, Any]:
		"""Read file and return structured data including images if applicable.

		Returns:
			dict with keys:
			- 'message': str - The message to display
			- 'images': list[dict] | None - Image data if file is an image: [{"name": str, "data": base64_str}]
		"""
		result: dict[str, Any] = {'message': '', 'images': None}

		if external_file:
			try:
				try:
					_, extension = self._parse_filename(full_filename)
				except Exception:
					result['message'] = (
						f'Error: Invalid filename format {full_filename}. Must be alphanumeric with a supported extension.'
					)
					return result

				# Text-based extensions: derive from _file_types, excluding those with special readers
				_special_extensions = {'docx', 'pdf', 'jpg', 'jpeg', 'png'}
				text_extensions = [ext for ext in self._file_types if ext not in _special_extensions]

				if extension in text_extensions:
					import anyio

					async with await anyio.open_file(full_filename, 'r') as f:
						content = await f.read()
					result['message'] = f'Read from file {full_filename}.\n<content>\n{content}\n</content>'
					return result

				elif extension == 'docx':
					from docx import Document

					doc = Document(full_filename)
					content = '\n'.join([para.text for para in doc.paragraphs])
					result['message'] = f'Read from file {full_filename}.\n<content>\n{content}\n</content>'
					return result

				elif extension == 'pdf':
					import pypdf

					reader = pypdf.PdfReader(full_filename)
					num_pages = len(reader.pages)
					MAX_CHARS = 60000  # character-based limit

					# Extract text from all pages with page markers
					page_texts: list[tuple[int, str]] = []
					total_chars = 0
					for i, page in enumerate(reader.pages, 1):
						text = page.extract_text() or ''
						page_texts.append((i, text))
						total_chars += len(text)

					# If small enough, return everything
					if total_chars <= MAX_CHARS:
						content_parts = []
						for page_num, text in page_texts:
							if text.strip():
								content_parts.append(f'--- Page {page_num} ---\n{text}')
						extracted_text = '\n\n'.join(content_parts)
						result['message'] = (
							f'Read from file {full_filename} ({num_pages} pages, {total_chars:,} chars).\n'
							f'<content>\n{extracted_text}\n</content>'
						)
						return result

					# Large PDF - use search to prioritize pages with distinctive content
					import math
					import re

					# Extract words from each page and count which pages they appear on
					word_to_pages: dict[str, set[int]] = {}
					page_words: dict[int, set[str]] = {}

					for page_num, text in page_texts:
						# Extract words (lowercase, 4+ chars to filter noise)
						words = set(re.findall(r'\b[a-zA-Z]{4,}\b', text.lower()))
						page_words[page_num] = words
						for word in words:
							if word not in word_to_pages:
								word_to_pages[word] = set()
							word_to_pages[word].add(page_num)

					# Score pages using inverse document frequency (IDF)
					# words appearing on fewer pages get higher weight
					page_scores: dict[int, float] = {}
					for page_num, words in page_words.items():
						score = 0.0
						for word in words:
							pages_with_word = len(word_to_pages[word])
							# IDF: log(total_pages / pages_with_word) - higher for rarer words
							score += math.log(num_pages / pages_with_word)
						page_scores[page_num] = score

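					# Worked example (illustrative): with num_pages = 10, a word appearing on only
					# 2 pages adds log(10 / 2) ≈ 1.61 to each of those pages' scores, while a word
					# present on all 10 pages adds log(10 / 10) = 0, so boilerplate repeated across
					# the whole document does not inflate any page's score.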
					# Sort pages by score (highest first), always include page 1
					sorted_pages = sorted(page_scores.items(), key=lambda x: -x[1])
					priority_pages = [1]
					for page_num, _ in sorted_pages:
						if page_num not in priority_pages:
							priority_pages.append(page_num)

					# Add remaining pages in order (for pages with no distinctive content)
					for page_num, _ in page_texts:
						if page_num not in priority_pages:
							priority_pages.append(page_num)

					# Build content from prioritized pages, respecting char limit
					content_parts = []
					chars_used = 0
					pages_included = []

					# First pass: add pages in priority order
					for page_num in priority_pages:
						text = page_texts[page_num - 1][1]
						if not text.strip():
							continue
						page_header = f'--- Page {page_num} ---\n'
						truncation_suffix = '\n[...truncated]'
						remaining = MAX_CHARS - chars_used
						# Need room for header + suffix + at least some content
						min_useful = len(page_header) + len(truncation_suffix) + 50
						if remaining < min_useful:
							break  # no room left for meaningful content
						page_content = page_header + text
						if len(page_content) > remaining:
							# Truncate page to fit remaining budget exactly
							page_content = page_content[: remaining - len(truncation_suffix)] + truncation_suffix
						content_parts.append((page_num, page_content))
						chars_used += len(page_content)
						pages_included.append(page_num)
						if chars_used >= MAX_CHARS:
							break

					# Sort included pages by page number for readability
					content_parts.sort(key=lambda x: x[0])
					extracted_text = '\n\n'.join(part for _, part in content_parts)

					pages_not_shown = num_pages - len(pages_included)
					if pages_not_shown > 0:
						skipped = [p for p in range(1, num_pages + 1) if p not in pages_included]
						truncation_note = (
							f'\n\n[Showing {len(pages_included)} of {num_pages} pages. '
							f'Skipped pages: {skipped[:10]}{"..." if len(skipped) > 10 else ""}. '
							f'Use extract with start_from_char to read further into the file.]'
						)
					else:
						truncation_note = ''

					result['message'] = (
						f'Read from file {full_filename} ({num_pages} pages, {total_chars:,} chars total).\n'
						f'<content>\n{extracted_text}{truncation_note}\n</content>'
					)
					return result

				elif extension in ['jpg', 'jpeg', 'png']:
					import anyio

					# Read image file and convert to base64
					async with await anyio.open_file(full_filename, 'rb') as f:
						img_data = await f.read()

					base64_str = base64.b64encode(img_data).decode('utf-8')

					result['message'] = f'Read image file {full_filename}.'
					result['images'] = [{'name': os.path.basename(full_filename), 'data': base64_str}]
					return result

				else:
					result['message'] = f'Error: Cannot read file {full_filename} as {extension} extension is not supported.'
					return result

			except FileNotFoundError:
				result['message'] = f"Error: File '{full_filename}' not found."
				return result
			except PermissionError:
				result['message'] = f"Error: Permission denied to read file '{full_filename}'."
				return result
			except Exception as e:
				result['message'] = f"Error: Could not read file '{full_filename}'. {str(e)}"
				return result

		# For internal files, only non-image types are supported
		resolved, was_sanitized = self._resolve_filename(full_filename)
		if not self._is_valid_filename(resolved):
			result['message'] = _build_filename_error_message(full_filename, self.get_allowed_extensions())
			return result

		file_obj = self.files.get(resolved)
		if not file_obj:
			if was_sanitized:
				result['message'] = f"File '{resolved}' not found. (Filename was auto-corrected from '{full_filename}')"
			else:
				result['message'] = f"File '{full_filename}' not found."
			return result

		try:
			content = file_obj.read()
			sanitize_note = f"Note: filename was auto-corrected from '{full_filename}' to '{resolved}'. " if was_sanitized else ''
			result['message'] = f'{sanitize_note}Read from file {resolved}.\n<content>\n{content}\n</content>'
			return result
		except FileSystemError as e:
			result['message'] = str(e)
			return result
		except Exception as e:
			result['message'] = f"Error: Could not read file '{full_filename}'. {str(e)}"
			return result

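	# Illustrative note (not part of the upstream module): for an external image file the
	# structured result looks roughly like
	#     {'message': 'Read image file shot.png.', 'images': [{'name': 'shot.png', 'data': '<base64>'}]}
	# while text reads put everything in 'message' and leave 'images' as None.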
	async def read_file(self, full_filename: str, external_file: bool = False) -> str:
		"""Read file content using file-specific read method and return appropriate message to LLM.

		Note: For image files, use read_file_structured() to get image data.
		"""
		result = await self.read_file_structured(full_filename, external_file)
		return result['message']

	async def write_file(self, full_filename: str, content: str) -> str:
		"""Write content to file using file-specific write method"""
		original_filename = full_filename
		resolved, was_sanitized = self._resolve_filename(full_filename)
		if not self._is_valid_filename(resolved):
			return _build_filename_error_message(full_filename, self.get_allowed_extensions())
		full_filename = resolved

		try:
			name_without_ext, extension = self._parse_filename(full_filename)
			file_class = self._get_file_type_class(extension)
			if not file_class:
				raise ValueError(f"Error: Invalid file extension '{extension}' for file '{full_filename}'.")

			# Create or get existing file using full filename as key
			if full_filename in self.files:
				file_obj = self.files[full_filename]
			else:
				file_obj = file_class(name=name_without_ext)
				self.files[full_filename] = file_obj  # Use full filename as key

			# Use file-specific write method
			await file_obj.write(content, self.data_dir)
			sanitize_note = f" (auto-corrected from '{original_filename}')" if was_sanitized else ''
			return f'Data written to file {full_filename} successfully.{sanitize_note}'
		except FileSystemError as e:
			return str(e)
		except Exception as e:
			return f"Error: Could not write to file '{full_filename}'. {str(e)}"

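	# Illustrative usage sketch (not part of the upstream module):
	#     await fs.write_file('notes.md', '# Findings')
	#         -> 'Data written to file notes.md successfully.'
	#     await fs.write_file('my*notes.md', 'text')
	#         -> "Data written to file mynotes.md successfully. (auto-corrected from 'my*notes.md')"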
	async def append_file(self, full_filename: str, content: str) -> str:
		"""Append content to file using file-specific append method"""
		original_filename = full_filename
		resolved, was_sanitized = self._resolve_filename(full_filename)
		if not self._is_valid_filename(resolved):
			return _build_filename_error_message(full_filename, self.get_allowed_extensions())
		full_filename = resolved

		file_obj = self.files.get(full_filename)
		if not file_obj:
			if was_sanitized:
				return f"File '{full_filename}' not found. (Filename was auto-corrected from '{original_filename}')"
			return f"File '{full_filename}' not found."

		try:
			await file_obj.append(content, self.data_dir)
			sanitize_note = f" (auto-corrected from '{original_filename}')" if was_sanitized else ''
			return f'Data appended to file {full_filename} successfully.{sanitize_note}'
		except FileSystemError as e:
			return str(e)
		except Exception as e:
			return f"Error: Could not append to file '{full_filename}'. {str(e)}"

	async def replace_file_str(self, full_filename: str, old_str: str, new_str: str) -> str:
		"""Replace old_str with new_str in file_name"""
		original_filename = full_filename
		resolved, was_sanitized = self._resolve_filename(full_filename)
		if not self._is_valid_filename(resolved):
			return _build_filename_error_message(full_filename, self.get_allowed_extensions())
		full_filename = resolved

		if not old_str:
			return 'Error: Cannot replace empty string. Please provide a non-empty string to replace.'

		file_obj = self.files.get(full_filename)
		if not file_obj:
			if was_sanitized:
				return f"File '{full_filename}' not found. (Filename was auto-corrected from '{original_filename}')"
			return f"File '{full_filename}' not found."

		try:
			content = file_obj.read()
			content = content.replace(old_str, new_str)
			await file_obj.write(content, self.data_dir)
			sanitize_note = f" (auto-corrected from '{original_filename}')" if was_sanitized else ''
			return f'Successfully replaced all occurrences of "{old_str}" with "{new_str}" in file {full_filename}{sanitize_note}'
		except FileSystemError as e:
			return str(e)
		except Exception as e:
			return f"Error: Could not replace string in file '{full_filename}'. {str(e)}"

	async def save_extracted_content(self, content: str) -> str:
		"""Save extracted content to a numbered file"""
		initial_filename = f'extracted_content_{self.extracted_content_count}'
		extracted_filename = f'{initial_filename}.md'
		file_obj = MarkdownFile(name=initial_filename)
		await file_obj.write(content, self.data_dir)
		self.files[extracted_filename] = file_obj
		self.extracted_content_count += 1
		return extracted_filename

	def describe(self) -> str:
		"""List all files with their content information using file-specific display methods"""
		DISPLAY_CHARS = 400
		description = ''

		for file_obj in self.files.values():
			# Skip todo.md from description
			if file_obj.full_name == 'todo.md':
				continue

			content = file_obj.read()

			# Handle empty files
			if not content:
				description += f'<file>\n{file_obj.full_name} - [empty file]\n</file>\n'
				continue

			lines = content.splitlines()
			line_count = len(lines)

			# For small files, display the entire content
			whole_file_description = (
				f'<file>\n{file_obj.full_name} - {line_count} lines\n<content>\n{content}\n</content>\n</file>\n'
			)
			if len(content) < int(1.5 * DISPLAY_CHARS):
				description += whole_file_description
				continue

			# For larger files, display start and end previews
			half_display_chars = DISPLAY_CHARS // 2

			# Get start preview
			start_preview = ''
			start_line_count = 0
			chars_count = 0
			for line in lines:
				if chars_count + len(line) + 1 > half_display_chars:
					break
				start_preview += line + '\n'
				chars_count += len(line) + 1
				start_line_count += 1

			# Get end preview
			end_preview = ''
			end_line_count = 0
			chars_count = 0
			for line in reversed(lines):
				if chars_count + len(line) + 1 > half_display_chars:
					break
				end_preview = line + '\n' + end_preview
				chars_count += len(line) + 1
				end_line_count += 1

			# Calculate lines in between
			middle_line_count = line_count - start_line_count - end_line_count
			if middle_line_count <= 0:
				description += whole_file_description
				continue

			start_preview = start_preview.strip('\n').rstrip()
			end_preview = end_preview.strip('\n').rstrip()

			# Format output
			if not (start_preview or end_preview):
				description += f'<file>\n{file_obj.full_name} - {line_count} lines\n<content>\n{middle_line_count} lines...\n</content>\n</file>\n'
			else:
				description += f'<file>\n{file_obj.full_name} - {line_count} lines\n<content>\n{start_preview}\n'
				description += f'... {middle_line_count} more lines ...\n'
				description += f'{end_preview}\n'
				description += '</content>\n</file>\n'

		return description.strip('\n')

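	# Illustrative note (not part of the upstream module): with DISPLAY_CHARS = 400, files under
	# 600 characters are shown in full; larger files show roughly 200 characters from the start
	# and the end with a '... N more lines ...' marker in between. todo.md is always omitted.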
	def get_todo_contents(self) -> str:
		"""Get todo file contents"""
		todo_file = self.get_file('todo.md')
		return todo_file.read() if todo_file else ''

	def get_state(self) -> FileSystemState:
		"""Get serializable state of the file system"""
		files_data = {}
		for full_filename, file_obj in self.files.items():
			files_data[full_filename] = {'type': file_obj.__class__.__name__, 'data': file_obj.model_dump()}

		return FileSystemState(
			files=files_data, base_dir=str(self.base_dir), extracted_content_count=self.extracted_content_count
		)

	def nuke(self) -> None:
		"""Delete the file system directory"""
		shutil.rmtree(self.data_dir)

	@classmethod
	def from_state(cls, state: FileSystemState) -> 'FileSystem':
		"""Restore file system from serializable state at the exact same location"""
		# Create file system without default files
		fs = cls(base_dir=Path(state.base_dir), create_default_files=False)
		fs.extracted_content_count = state.extracted_content_count

		# Restore all files
		for full_filename, file_data in state.files.items():
			file_type = file_data['type']
			file_info = file_data['data']

			# Create the appropriate file object based on type
			file_type_map: dict[str, type[BaseFile]] = {
				'MarkdownFile': MarkdownFile,
				'TxtFile': TxtFile,
				'JsonFile': JsonFile,
				'JsonlFile': JsonlFile,
				'CsvFile': CsvFile,
				'PdfFile': PdfFile,
				'DocxFile': DocxFile,
				'HtmlFile': HtmlFile,
				'XmlFile': XmlFile,
			}

			file_class = file_type_map.get(file_type)
			if not file_class:
				# Skip unknown file types
				continue
			file_obj = file_class(**file_info)

			# Add to files dict and sync to disk
			fs.files[full_filename] = file_obj
			file_obj.sync_to_disk_sync(fs.data_dir)

		return fs
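
# Illustrative demo (not part of the upstream module): a minimal end-to-end sketch of the
# FileSystem API under an import guard, so it only runs when this file is executed directly.
# The path and contents are arbitrary example values.
if __name__ == '__main__':

	async def _demo() -> None:
		fs = FileSystem('/tmp/browseruse_demo')  # files live under /tmp/browseruse_demo/browseruse_agent_data
		print(await fs.write_file('results.csv', 'name,score\nAda,10'))
		print(await fs.append_file('results.csv', 'Bob,8'))
		print(fs.describe())

		# Round-trip the serializable state and restore at the same location
		state = fs.get_state()
		restored = FileSystem.from_state(state)
		print(restored.list_files())

	asyncio.run(_demo())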