mirror of
https://github.com/open-webui/open-webui.git
synced 2026-04-26 01:25:34 +02:00
796 lines
26 KiB
Python
796 lines
26 KiB
Python
import logging
|
||
import os
|
||
import uuid
|
||
import json
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
from urllib.parse import quote
|
||
import asyncio
|
||
|
||
from fastapi import (
|
||
BackgroundTasks,
|
||
APIRouter,
|
||
Depends,
|
||
File,
|
||
Form,
|
||
HTTPException,
|
||
Request,
|
||
UploadFile,
|
||
status,
|
||
Query,
|
||
)
|
||
|
||
from fastapi.responses import FileResponse, StreamingResponse
|
||
from sqlalchemy.orm import Session
|
||
from open_webui.internal.db import get_session, SessionLocal
|
||
|
||
from open_webui.constants import ERROR_MESSAGES
|
||
from open_webui.retrieval.vector.factory import VECTOR_DB_CLIENT
|
||
|
||
from open_webui.models.channels import Channels
|
||
from open_webui.models.users import Users
|
||
from open_webui.models.files import (
|
||
FileForm,
|
||
FileListResponse,
|
||
FileModel,
|
||
FileModelResponse,
|
||
Files,
|
||
)
|
||
from open_webui.models.chats import Chats
|
||
from open_webui.models.knowledge import Knowledges
|
||
from open_webui.models.groups import Groups
|
||
from open_webui.models.access_grants import AccessGrants
|
||
|
||
|
||
from open_webui.routers.retrieval import ProcessFileForm, process_file
|
||
from open_webui.routers.audio import transcribe
|
||
|
||
from open_webui.storage.provider import Storage
|
||
|
||
|
||
from open_webui.config import BYPASS_ADMIN_ACCESS_CONTROL
|
||
from open_webui.utils.auth import get_admin_user, get_verified_user
|
||
from open_webui.utils.misc import strict_match_mime_type
|
||
from pydantic import BaseModel
|
||
|
||
log = logging.getLogger(__name__)
|
||
|
||
router = APIRouter()
|
||
|
||
|
||
from open_webui.utils.access_control.files import has_access_to_file
|
||
|
||
############################
|
||
# Upload File
|
||
############################
|
||
|
||
|
||
def _is_text_file(file_path: str, chunk_size: int = 8192) -> bool:
    """Check if a file is likely a text file by reading a chunk and validating UTF-8.

    This catches files whose extensions are mis-mapped by mimetypes/browsers
    (e.g. TypeScript .ts → video/mp2t) without maintaining an extension whitelist.

    :param file_path: storage path, resolved to a local path via ``Storage.get_file``.
    :param chunk_size: number of bytes sampled from the start of the file.
    :return: True if the sampled chunk is non-empty, NUL-free, and valid UTF-8.
    """
    try:
        resolved = Storage.get_file(file_path)
        with open(resolved, 'rb') as f:
            chunk = f.read(chunk_size)
        # An empty file carries no evidence either way; treat it as non-text.
        if not chunk:
            return False
        # Null bytes are a strong indicator of binary content
        if b'\x00' in chunk:
            return False
        chunk.decode('utf-8')
        return True
    except Exception:
        # Fix: the original `except (UnicodeDecodeError, Exception)` was
        # redundant — UnicodeDecodeError is already an Exception subclass.
        # Any failure (storage lookup, I/O, decoding) means we cannot confirm
        # text content, so report False (best-effort heuristic by design).
        return False
|
||
|
||
|
||
def process_uploaded_file(
    request,
    file,
    file_path,
    file_item,
    file_metadata,
    user,
    db: Optional[Session] = None,
):
    """Extract or transcribe an uploaded file's content and index it.

    Routing: files whose (possibly corrected) MIME type matches the configured
    STT types are transcribed and the transcript indexed; other non-media
    files — or any file when the 'external' content-extraction engine is
    configured — go through the generic ``process_file`` pipeline. On any
    failure the file record's ``data.status`` is set to ``'failed'`` together
    with the error message.

    :param request: FastAPI request; used to read app-level config.
    :param file: the UploadFile whose ``content_type`` drives routing.
    :param file_path: storage path of the persisted upload.
    :param file_item: inserted file DB model; supplies the file id.
    :param file_metadata: caller-supplied metadata dict, forwarded to STT.
    :param user: uploading user, forwarded to the processing pipelines.
    :param db: optional DB session. When absent (e.g. when invoked as a
        background task), a short-lived session is created for the call.
    """

    def _process_handler(db_session):
        try:
            content_type = file.content_type

            # Detect mis-labeled text files (e.g. .ts → video/mp2t)
            if content_type and content_type.startswith(('image/', 'video/')):
                if _is_text_file(file_path):
                    content_type = 'text/plain'

            if content_type:
                stt_supported_content_types = getattr(request.app.state.config, 'STT_SUPPORTED_CONTENT_TYPES', [])

                if strict_match_mime_type(stt_supported_content_types, content_type):
                    # Audio/video configured for STT: transcribe first, then
                    # index the transcript text instead of the raw file.
                    file_path_processed = Storage.get_file(file_path)
                    result = transcribe(request, file_path_processed, file_metadata, user)

                    process_file(
                        request,
                        ProcessFileForm(file_id=file_item.id, content=result.get('text', '')),
                        user=user,
                        db=db_session,
                    )
                elif (not content_type.startswith(('image/', 'video/'))) or (
                    request.app.state.config.CONTENT_EXTRACTION_ENGINE == 'external'
                ):
                    # Non-media files — or any file when an external extraction
                    # engine is configured — go through content extraction.
                    process_file(
                        request,
                        ProcessFileForm(file_id=file_item.id),
                        user=user,
                        db=db_session,
                    )
                else:
                    # Media file with no STT match and no external engine:
                    # nothing can process it; the except block records failure.
                    raise Exception(f'File type {content_type} is not supported for processing')
            else:
                # No MIME type supplied; attempt generic processing anyway.
                log.info(f'File type {file.content_type} is not provided, but trying to process anyway')
                process_file(
                    request,
                    ProcessFileForm(file_id=file_item.id),
                    user=user,
                    db=db_session,
                )

        except Exception as e:
            log.error(f'Error processing file: {file_item.id}')
            # Persist the failure on the file record so status pollers
            # (see the /{id}/process/status endpoint) can surface it.
            Files.update_file_data_by_id(
                file_item.id,
                {
                    'status': 'failed',
                    'error': str(e.detail) if hasattr(e, 'detail') else str(e),
                },
                db=db_session,
            )

    if db:
        _process_handler(db)
    else:
        # Background-task path: open a dedicated session rather than reusing a
        # request-scoped one that may already be closed by the time we run.
        with SessionLocal() as db_session:
            _process_handler(db_session)
|
||
|
||
|
||
@router.post('/', response_model=FileModelResponse)
def upload_file(
    request: Request,
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    metadata: Optional[dict | str] = Form(None),
    process: bool = Query(True),
    process_in_background: bool = Query(True),
    user=Depends(get_verified_user),
    db: Session = Depends(get_session),
):
    """HTTP entry point for uploads; delegates all work to `upload_file_handler`."""
    handler_kwargs = {
        'file': file,
        'metadata': metadata,
        'process': process,
        'process_in_background': process_in_background,
        'user': user,
        'background_tasks': background_tasks,
        'db': db,
    }
    return upload_file_handler(request, **handler_kwargs)
|
||
|
||
|
||
def upload_file_handler(
    request: Request,
    file: UploadFile = File(...),
    metadata: Optional[dict | str] = Form(None),
    process: bool = Query(True),
    process_in_background: bool = Query(True),
    user=Depends(get_verified_user),
    background_tasks: Optional[BackgroundTasks] = None,
    db: Optional[Session] = None,
):
    """Persist an uploaded file and optionally kick off content processing.

    Steps: parse/validate ``metadata``, enforce allowed extensions (only when
    processing is enabled), store the blob under a UUID-prefixed name, insert
    the DB record, attach to a channel when ``metadata.channel_id`` is given,
    then process inline or via ``background_tasks``.

    :raises HTTPException: 400 on invalid metadata JSON, disallowed extension,
        or any storage/DB failure.
    """
    log.info(f'file.content_type: {file.content_type} {process}')

    if isinstance(metadata, str):
        try:
            metadata = json.loads(metadata)
        except json.JSONDecodeError:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT('Invalid metadata format'),
            )
    file_metadata = metadata if metadata else {}

    try:
        # Strip any directory components (path-traversal hardening).
        unsanitized_filename = file.filename
        filename = os.path.basename(unsanitized_filename)

        file_extension = os.path.splitext(filename)[1]
        # Remove the leading dot from the file extension
        file_extension = file_extension[1:] if file_extension else ''

        if process and request.app.state.config.ALLOWED_FILE_EXTENSIONS:
            # Drop empty entries so a stray '' does not block/allow by accident.
            request.app.state.config.ALLOWED_FILE_EXTENSIONS = [
                ext for ext in request.app.state.config.ALLOWED_FILE_EXTENSIONS if ext
            ]

            if file_extension not in request.app.state.config.ALLOWED_FILE_EXTENSIONS:
                raise HTTPException(
                    status_code=status.HTTP_400_BAD_REQUEST,
                    detail=ERROR_MESSAGES.DEFAULT(f'File type {file_extension} is not allowed'),
                )

        # Replace filename with a uuid prefix so stored names are unique.
        # (Local renamed from `id` to avoid shadowing the builtin.)
        file_id = str(uuid.uuid4())
        name = filename
        # BUG FIX: the stored object name was previously the literal
        # f'{id}_(unknown)', discarding the original filename and extension.
        filename = f'{file_id}_{filename}'
        contents, file_path = Storage.upload_file(
            file.file,
            filename,
            {
                'OpenWebUI-User-Email': user.email,
                'OpenWebUI-User-Id': user.id,
                'OpenWebUI-User-Name': user.name,
                'OpenWebUI-File-Id': file_id,
            },
        )

        file_item = Files.insert_new_file(
            user.id,
            FileForm(
                **{
                    'id': file_id,
                    'filename': name,
                    'path': file_path,
                    'data': {
                        # 'pending' lets the status endpoint track processing.
                        **({'status': 'pending'} if process else {}),
                    },
                    'meta': {
                        'name': name,
                        'content_type': (file.content_type if isinstance(file.content_type, str) else None),
                        'size': len(contents),
                        'data': file_metadata,
                    },
                }
            ),
            db=db,
        )

        if 'channel_id' in file_metadata:
            channel = Channels.get_channel_by_id_and_user_id(file_metadata['channel_id'], user.id, db=db)
            if channel:
                Channels.add_file_to_channel_by_id(channel.id, file_item.id, user.id, db=db)

        if process:
            if background_tasks and process_in_background:
                # No db session is passed: the task opens its own (see
                # process_uploaded_file), since this request's session will
                # be closed before the task runs.
                background_tasks.add_task(
                    process_uploaded_file,
                    request,
                    file,
                    file_path,
                    file_item,
                    file_metadata,
                    user,
                )
                return {'status': True, **file_item.model_dump()}
            else:
                process_uploaded_file(
                    request,
                    file,
                    file_path,
                    file_item,
                    file_metadata,
                    user,
                    db=db,
                )
                return {'status': True, **file_item.model_dump()}
        else:
            if file_item:
                return file_item
            else:
                raise HTTPException(
                    status_code=status.HTTP_400_BAD_REQUEST,
                    detail=ERROR_MESSAGES.DEFAULT('Error uploading file'),
                )

    except HTTPException as e:
        raise e
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT('Error uploading file'),
        )
|
||
|
||
|
||
############################
|
||
# List Files
|
||
############################
|
||
|
||
|
||
PAGE_SIZE = 50  # Number of files returned per page by the list endpoint.
|
||
|
||
|
||
@router.get('/', response_model=FileListResponse)
async def list_files(
    user=Depends(get_verified_user),
    page: int = Query(1, ge=1, description='Page number (1-indexed)'),
    content: bool = Query(True),
    db: Session = Depends(get_session),
):
    """List files, paginated; optionally strip extracted text from each entry."""
    # Admins with the bypass flag see every file; everyone else only their own.
    if user.role == 'admin' and BYPASS_ADMIN_ACCESS_CONTROL:
        owner_id = None
    else:
        owner_id = user.id

    offset = (page - 1) * PAGE_SIZE
    result = Files.get_file_list(user_id=owner_id, skip=offset, limit=PAGE_SIZE, db=db)

    if not content:
        # Drop extracted text bodies to keep the listing payload small.
        for item in result.items:
            if item.data and 'content' in item.data:
                del item.data['content']

    return result
|
||
|
||
|
||
############################
|
||
# Search Files
|
||
############################
|
||
|
||
|
||
@router.get('/search', response_model=list[FileModelResponse])
async def search_files(
    filename: str = Query(
        ...,
        description="Filename pattern to search for. Supports wildcards such as '*.txt'",
    ),
    content: bool = Query(True),
    skip: int = Query(0, ge=0, description='Number of files to skip'),
    limit: int = Query(100, ge=1, le=1000, description='Maximum number of files to return'),
    user=Depends(get_verified_user),
    db: Session = Depends(get_session),
):
    """
    Search for files by filename with support for wildcard patterns.
    Filtering and pagination are done in SQL for performance; raises 404
    when nothing matches.
    """
    # Admins with the bypass flag search across all users' files.
    if user.role == 'admin' and BYPASS_ADMIN_ACCESS_CONTROL:
        scope_user_id = None
    else:
        scope_user_id = user.id

    matches = Files.search_files(
        user_id=scope_user_id,
        filename=filename,
        skip=skip,
        limit=limit,
        db=db,
    )

    if not matches:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail='No files found matching the pattern.',
        )

    if not content:
        # Strip extracted text bodies to keep the response lightweight.
        for match in matches:
            if match.data and 'content' in match.data:
                del match.data['content']

    return matches
|
||
|
||
|
||
############################
|
||
# Delete All Files
|
||
############################
|
||
|
||
|
||
@router.delete('/all')
async def delete_all_files(user=Depends(get_admin_user), db: Session = Depends(get_session)):
    """Admin-only: delete every file record, stored blob, and vector entry."""
    deleted = Files.delete_all_files(db=db)
    if not deleted:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT('Error deleting files'),
        )

    try:
        # DB rows are gone; now clear the blob store and the vector index.
        Storage.delete_all_files()
        VECTOR_DB_CLIENT.reset()
    except Exception as e:
        log.exception(e)
        log.error('Error deleting files')
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT('Error deleting files'),
        )

    return {'message': 'All files deleted successfully'}
|
||
|
||
|
||
############################
|
||
# Get File By Id
|
||
############################
|
||
|
||
|
||
@router.get('/{id}', response_model=Optional[FileModel])
async def get_file_by_id(id: str, user=Depends(get_verified_user), db: Session = Depends(get_session)):
    """Return a file record if it exists and the requester may read it."""
    file = Files.get_file_by_id(id, db=db)

    if not file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    # Owner, admin, or an explicit read grant may access the record.
    if not (file.user_id == user.id or user.role == 'admin' or has_access_to_file(id, 'read', user, db=db)):
        # Reply 404 (not 403) so callers cannot probe for file existence.
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    return file
|
||
|
||
|
||
@router.get('/{id}/process/status')
async def get_file_process_status(
    id: str,
    stream: bool = Query(False),
    user=Depends(get_verified_user),
    db: Session = Depends(get_session),
):
    """Report a file's processing status.

    With ``stream=False`` returns a single ``{'status': ...}`` snapshot
    (defaulting to 'pending'). With ``stream=True`` returns a server-sent
    events stream that polls once per second until the status reaches
    'completed'/'failed', the record disappears, or ~2 hours elapse.
    Access: owner, admin, or an explicit read grant; otherwise 404.
    """
    file = Files.get_file_by_id(id, db=db)

    if not file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    if file.user_id == user.id or user.role == 'admin' or has_access_to_file(id, 'read', user, db=db):
        if stream:
            # Upper bound on polling iterations (1/s → roughly two hours).
            MAX_FILE_PROCESSING_DURATION = 3600 * 2

            async def event_stream(file_id):
                # NOTE: We intentionally do NOT capture the request's db session here.
                # Each poll creates its own short-lived session to avoid holding a
                # connection for hours. A WebSocket push would be more efficient.
                for _ in range(MAX_FILE_PROCESSING_DURATION):
                    file_item = Files.get_file_by_id(file_id)  # Creates own session
                    if file_item:
                        data = file_item.model_dump().get('data', {})
                        # Local `status` shadows the imported fastapi `status`
                        # module inside this coroutine (module not used here).
                        status = data.get('status')

                        if status:
                            event = {'status': status}
                            if status == 'failed':
                                # Surface the recorded error to the client.
                                event['error'] = data.get('error')

                            yield f'data: {json.dumps(event)}\n\n'
                            if status in ('completed', 'failed'):
                                break
                        else:
                            # Legacy records carry no status field — nothing
                            # to track, so end the stream immediately.
                            break
                    else:
                        # File vanished mid-stream (e.g. deleted).
                        yield f'data: {json.dumps({"status": "not_found"})}\n\n'
                        break

                    await asyncio.sleep(1)

            return StreamingResponse(
                event_stream(file.id),
                media_type='text/event-stream',
            )
        else:
            return {'status': file.data.get('status', 'pending')}
    else:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )
|
||
|
||
|
||
############################
|
||
# Get File Data Content By Id
|
||
############################
|
||
|
||
|
||
@router.get('/{id}/data/content')
async def get_file_data_content_by_id(id: str, user=Depends(get_verified_user), db: Session = Depends(get_session)):
    """Return the extracted text content stored on the file record."""
    file = Files.get_file_by_id(id, db=db)

    if not file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    allowed = (
        file.user_id == user.id
        or user.role == 'admin'
        or has_access_to_file(id, 'read', user, db=db)
    )
    if not allowed:
        # 404 rather than 403 to avoid leaking file existence.
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    return {'content': file.data.get('content', '')}
|
||
|
||
|
||
############################
|
||
# Update File Data Content By Id
|
||
############################
|
||
|
||
|
||
class ContentForm(BaseModel):
    """Request body for updating a file's extracted text content."""

    content: str
|
||
|
||
|
||
@router.post('/{id}/data/content/update')
def update_file_data_content_by_id(
    request: Request,
    id: str,
    form_data: ContentForm,
    user=Depends(get_verified_user),
    db: Session = Depends(get_session),
):
    """Replace a file's extracted text content, re-index it, and propagate
    the change to every knowledge collection that references the file.

    Access: owner, admin, or an explicit write grant; otherwise 404.
    Returns the (re-fetched) file's ``data.content``.
    """
    file = Files.get_file_by_id(id, db=db)

    if not file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    if file.user_id == user.id or user.role == 'admin' or has_access_to_file(id, 'write', user, db=db):
        try:
            # Re-embed the file's own collection with the new content.
            process_file(
                request,
                ProcessFileForm(file_id=id, content=form_data.content),
                user=user,
                db=db,
            )
            # Re-fetch so the response reflects the stored (updated) content.
            file = Files.get_file_by_id(id=id, db=db)
        except Exception as e:
            # Deliberately non-fatal: log and fall through so the response
            # still returns whatever content is currently stored.
            log.exception(e)
            log.error(f'Error processing file: {file.id}')

        # Propagate content change to all knowledge collections referencing
        # this file. Without this the old embeddings remain in the knowledge
        # collection and RAG returns both stale and current data (#20558).
        knowledges = Knowledges.get_knowledges_by_file_id(id, db=db)
        for knowledge in knowledges:
            try:
                # Remove old embeddings for this file from the KB collection
                VECTOR_DB_CLIENT.delete(collection_name=knowledge.id, filter={'file_id': id})
                # Re-add from the now-updated file-{file_id} collection
                process_file(
                    request,
                    ProcessFileForm(file_id=id, collection_name=knowledge.id),
                    user=user,
                    db=db,
                )
            except Exception as e:
                # Best-effort per KB: one failing collection shouldn't block
                # the rest (or the response).
                log.warning(f'Failed to update knowledge {knowledge.id} after content change for file {id}: {e}')

        return {'content': file.data.get('content', '')}
    else:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )
|
||
|
||
|
||
############################
|
||
# Get File Content By Id
|
||
############################
|
||
|
||
|
||
@router.get('/{id}/content')
async def get_file_content_by_id(
    id: str,
    user=Depends(get_verified_user),
    attachment: bool = Query(False),
    db: Session = Depends(get_session),
):
    """Serve the raw stored file for a file record.

    ``attachment=True`` forces a download disposition; otherwise PDFs are
    served inline and plain text gets no disposition header at all. Access:
    owner, admin, or an explicit read grant; otherwise 404.
    """
    file = Files.get_file_by_id(id, db=db)

    if not file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    if file.user_id == user.id or user.role == 'admin' or has_access_to_file(id, 'read', user, db=db):
        try:
            file_path = Storage.get_file(file.path)
            file_path = Path(file_path)

            # Check if the file already exists in the cache
            if file_path.is_file():
                # Handle Unicode filenames via RFC 5987 percent-encoding.
                # (Cleanup: these two values were previously computed twice
                # back-to-back; the duplicate block has been removed.)
                filename = file.meta.get('name', file.filename)
                encoded_filename = quote(filename)  # RFC5987 encoding

                content_type = file.meta.get('content_type')
                headers = {}

                if attachment:
                    headers['Content-Disposition'] = f"attachment; filename*=UTF-8''{encoded_filename}"
                else:
                    if content_type == 'application/pdf' or filename.lower().endswith('.pdf'):
                        # PDFs render inline in the browser.
                        headers['Content-Disposition'] = f"inline; filename*=UTF-8''{encoded_filename}"
                        content_type = 'application/pdf'
                    elif content_type != 'text/plain':
                        # Anything else non-text downloads as an attachment.
                        headers['Content-Disposition'] = f"attachment; filename*=UTF-8''{encoded_filename}"

                return FileResponse(file_path, headers=headers, media_type=content_type)

            else:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=ERROR_MESSAGES.NOT_FOUND,
                )
        except HTTPException as e:
            raise e
        except Exception as e:
            log.exception(e)
            log.error('Error getting file content')
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT('Error getting file content'),
            )
    else:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )
|
||
|
||
|
||
@router.get('/{id}/content/html')
async def get_html_file_content_by_id(id: str, user=Depends(get_verified_user), db: Session = Depends(get_session)):
    """Serve a stored file for inline rendering (HTML content route).

    Only files uploaded by an admin are served here; the requester must also
    pass the usual owner/admin/read-grant check. All denials and misses
    respond 404 so file existence is not leaked.
    """
    file = Files.get_file_by_id(id, db=db)

    if not file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    file_user = Users.get_user_by_id(file.user_id, db=db)
    # Robustness fix: the uploader account may have been deleted, in which
    # case the lookup presumably returns None and `.role` would raise
    # (500) — guard first. Also `!=` replaces the `not ... ==` idiom.
    if not file_user or file_user.role != 'admin':
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    if file.user_id == user.id or user.role == 'admin' or has_access_to_file(id, 'read', user, db=db):
        try:
            file_path = Storage.get_file(file.path)
            file_path = Path(file_path)

            # Check if the file already exists in the cache
            if file_path.is_file():
                log.info(f'file_path: {file_path}')
                return FileResponse(file_path)
            else:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=ERROR_MESSAGES.NOT_FOUND,
                )
        except HTTPException as e:
            raise e
        except Exception as e:
            log.exception(e)
            log.error('Error getting file content')
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT('Error getting file content'),
            )
    else:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )
|
||
|
||
|
||
@router.get('/{id}/content/{file_name}')
async def get_file_content_by_id(id: str, user=Depends(get_verified_user), db: Session = Depends(get_session)):
    """Download a file by id; the ``{file_name}`` path segment only shapes the
    URL (friendly download links) and is not read by this handler.

    NOTE(review): this redefines the module-level name `get_file_content_by_id`
    from the `/{id}/content` route above. FastAPI registers both routes at
    decoration time, but the Python name now points here — consider renaming.
    """
    file = Files.get_file_by_id(id, db=db)

    if not file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    if file.user_id == user.id or user.role == 'admin' or has_access_to_file(id, 'read', user, db=db):
        file_path = file.path

        # Handle Unicode filenames
        filename = file.meta.get('name', file.filename)
        encoded_filename = quote(filename)  # RFC5987 encoding
        headers = {'Content-Disposition': f"attachment; filename*=UTF-8''{encoded_filename}"}

        if file_path:
            file_path = Storage.get_file(file_path)
            file_path = Path(file_path)

            # Check if the file already exists in the cache
            if file_path.is_file():
                return FileResponse(file_path, headers=headers)
            else:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=ERROR_MESSAGES.NOT_FOUND,
                )
        else:
            # File path doesn’t exist, return the content as .txt if possible
            # NOTE(review): reads `file.content` here while sibling endpoints
            # use `file.data` — presumably a legacy field; confirm against the
            # FileModel definition.
            file_content = file.content.get('content', '')
            file_name = file.filename  # NOTE(review): assigned but never used

            # Create a generator that encodes the file content
            def generator():
                yield file_content.encode('utf-8')

            return StreamingResponse(
                generator(),
                media_type='text/plain',
                headers=headers,
            )
    else:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )
|
||
|
||
|
||
############################
|
||
# Delete File By Id
|
||
############################
|
||
|
||
|
||
@router.delete('/{id}')
async def delete_file_by_id(id: str, user=Depends(get_verified_user), db: Session = Depends(get_session)):
    """Delete a file and all of its derived state.

    Order matters: first detach the file from every knowledge base and purge
    its KB embeddings, then delete the DB row, then the stored blob and the
    file's own vector collection. Access: owner, admin, or an explicit write
    grant; otherwise 404.
    """
    file = Files.get_file_by_id(id, db=db)

    if not file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    if file.user_id == user.id or user.role == 'admin' or has_access_to_file(id, 'write', user, db=db):
        # Clean up KB associations and embeddings before deleting
        knowledges = Knowledges.get_knowledges_by_file_id(id, db=db)
        for knowledge in knowledges:
            # Remove KB-file relationship
            Knowledges.remove_file_from_knowledge_by_id(knowledge.id, id, db=db)
            # Clean KB embeddings (same logic as /knowledge/{id}/file/remove)
            try:
                VECTOR_DB_CLIENT.delete(collection_name=knowledge.id, filter={'file_id': id})
                if file.hash:
                    # Some entries are keyed by content hash rather than id.
                    VECTOR_DB_CLIENT.delete(collection_name=knowledge.id, filter={'hash': file.hash})
            except Exception as e:
                # Best-effort: embedding cleanup failure must not block deletion.
                log.debug(f'KB embedding cleanup for {knowledge.id}: {e}')

        result = Files.delete_file_by_id(id, db=db)
        if result:
            try:
                # DB row gone; remove the blob and the file's own collection.
                Storage.delete_file(file.path)
                VECTOR_DB_CLIENT.delete(collection_name=f'file-{id}')
            except Exception as e:
                log.exception(e)
                log.error('Error deleting files')
                raise HTTPException(
                    status_code=status.HTTP_400_BAD_REQUEST,
                    detail=ERROR_MESSAGES.DEFAULT('Error deleting files'),
                )
            return {'message': 'File deleted successfully'}
        else:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT('Error deleting file'),
            )
    else:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )
|