fix: consolidate psql cleanup logic and fix web add with cleanup (#20072)

* sequential

* consolidate logic and fix for web add

* Update WebSearch.svelte

* Update retrieval.py

* Update retrieval.py

* Update WebSearch.svelte
This commit is contained in:
Classic298
2025-12-21 13:14:29 +01:00
committed by GitHub
parent 5077676d33
commit 48ccb1e170
3 changed files with 33 additions and 15 deletions

View File

@@ -373,6 +373,32 @@ def sanitize_filename(file_name):
return final_file_name
def sanitize_text_for_db(text: str) -> str:
    """Strip null bytes and lone surrogate code points so text is safe for PostgreSQL.

    Values that are not ``str`` are handed back unchanged.
    """
    if isinstance(text, str):
        # PostgreSQL text columns reject \x00, so drop every occurrence first.
        without_nulls = text.replace("\x00", "")
        try:
            # "surrogatepass" lets surrogates through the encoder as raw bytes;
            # those bytes are not valid UTF-8, so decoding with "ignore"
            # discards them while leaving well-formed characters intact.
            encoded = without_nulls.encode("utf-8", errors="surrogatepass")
            return encoded.decode("utf-8", errors="ignore")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Best effort: fall back to the null-stripped text.
            return without_nulls
    return text
def sanitize_data_for_db(obj):
    """Recursively clean every string value in ``obj`` for database storage.

    Dicts are rebuilt with their values sanitized (keys are kept as-is) and
    lists are rebuilt element by element; strings go through
    ``sanitize_text_for_db``. Any other object is returned untouched.
    """
    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            cleaned[key] = sanitize_data_for_db(value)
        return cleaned
    if isinstance(obj, list):
        return list(map(sanitize_data_for_db, obj))
    if isinstance(obj, str):
        return sanitize_text_for_db(obj)
    return obj
def extract_folders_after_data_docs(path):
# Convert the path to a Path object if it's not already
path = Path(path)