open-webui/backend/open_webui/routers/files.py

613 lines
18 KiB
Python
Raw Normal View History

2024-08-28 06:10:27 +08:00
import logging
2024-06-19 02:36:55 +08:00
import os
import uuid
2025-04-08 12:42:37 +08:00
from fnmatch import fnmatch
2024-08-28 06:10:27 +08:00
from pathlib import Path
from typing import Optional
2024-12-16 15:08:51 +08:00
from urllib.parse import quote
2024-10-05 08:22:00 +08:00
2025-03-30 06:23:02 +08:00
from fastapi import (
APIRouter,
Depends,
File,
HTTPException,
Request,
UploadFile,
status,
Query,
)
2025-02-07 07:32:06 +08:00
from fastapi.responses import FileResponse, StreamingResponse
from open_webui.constants import ERROR_MESSAGES
from open_webui.env import SRC_LOG_LEVELS
from open_webui.models.users import Users
2024-12-10 16:54:13 +08:00
from open_webui.models.files import (
2024-10-21 14:38:26 +08:00
FileForm,
FileModel,
FileModelResponse,
Files,
)
2025-03-31 16:10:18 +08:00
from open_webui.models.knowledge import Knowledges
from open_webui.routers.knowledge import get_knowledge, get_knowledge_list
2025-02-07 07:32:06 +08:00
from open_webui.routers.retrieval import ProcessFileForm, process_file
2025-02-27 05:09:52 +08:00
from open_webui.routers.audio import transcribe
2025-02-07 07:32:06 +08:00
from open_webui.storage.provider import Storage
2024-12-09 08:01:56 +08:00
from open_webui.utils.auth import get_admin_user, get_verified_user
2025-02-07 07:32:06 +08:00
from pydantic import BaseModel
2024-06-19 02:36:55 +08:00
# Module-level logger; its level is taken from the "MODELS" entry of SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["MODELS"])
# Router on which every endpoint defined in this module is registered.
router = APIRouter()
2025-03-31 16:10:18 +08:00
############################
# Check if the current user has access to a file through any knowledge bases the user may be in.
############################
2025-03-31 16:10:18 +08:00
def has_access_to_file(
    file_id: Optional[str], access_type: str, user=Depends(get_verified_user)
) -> bool:
    """Tell whether ``user`` can reach a file through one of their knowledge bases.

    The file's ``meta["collection_name"]`` is treated as a knowledge base id
    and compared against the ids of the knowledge bases returned for the user
    and the requested ``access_type``.

    Args:
        file_id: Id of the file to check.
        access_type: Access level forwarded to the knowledge-base lookup
            (callers in this module pass "read" or "write").
        user: The user whose access is being checked.

    Returns:
        True when a knowledge base available to the user matches the file's
        collection name, False otherwise (including when the file has no
        metadata or no collection name).

    Raises:
        HTTPException: 404 when no file with ``file_id`` exists.
    """
    file = Files.get_file_by_id(file_id)
    log.debug(f"Checking if user has {access_type} access to file")

    if not file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    # Without metadata there is no collection to match against.
    if not file.meta:
        return False

    collection_id = file.meta.get("collection_name")
    if not collection_id:
        return False

    # Only query the knowledge bases once we know there is something to match.
    candidates = Knowledges.get_knowledge_bases_by_user_id(user.id, access_type)
    return any(candidate.id == collection_id for candidate in candidates)
2024-06-19 02:36:55 +08:00
############################
# Upload File
############################
2024-10-27 03:56:37 +08:00
@router.post("/", response_model=FileModelResponse)
def upload_file(
    request: Request,
    file: UploadFile = File(...),
    user=Depends(get_verified_user),
    metadata: dict = None,
    process: bool = Query(True),
):
    """Store an uploaded file, record it in the database and optionally process it.

    Args:
        request: Incoming request; ``request.app.state.config`` supplies
            ``ALLOWED_FILE_EXTENSIONS`` and the request is forwarded to the
            transcription / processing helpers.
        file: The uploaded file.
        user: Verified user performing the upload.
        metadata: Optional extra metadata stored under ``meta["data"]``. When
            it is non-empty the allowed-extension check is skipped.
        process: When true, transcribe / process the file after storing it.

    Returns:
        The stored file item. If processing fails, the item is still returned,
        wrapped in a ``FileModelResponse`` whose ``error`` field carries the
        failure message.

    Raises:
        HTTPException: 400 when the extension is not allowed, when no file item
            is available after the upload, or when any other step fails.
    """
    log.info(f"file.content_type: {file.content_type}")

    file_metadata = metadata if metadata else {}
    try:
        unsanitized_filename = file.filename
        # Keep only the final path component of the client-supplied name.
        filename = os.path.basename(unsanitized_filename)

        file_extension = os.path.splitext(filename)[1]
        # Remove the leading dot from the file extension
        file_extension = file_extension[1:] if file_extension else ""

        if not file_metadata and request.app.state.config.ALLOWED_FILE_EXTENSIONS:
            # Drop empty entries from the configured allow-list before checking.
            request.app.state.config.ALLOWED_FILE_EXTENSIONS = [
                ext for ext in request.app.state.config.ALLOWED_FILE_EXTENSIONS if ext
            ]

            if file_extension not in request.app.state.config.ALLOWED_FILE_EXTENSIONS:
                raise HTTPException(
                    status_code=status.HTTP_400_BAD_REQUEST,
                    detail=ERROR_MESSAGES.DEFAULT(
                        f"File type {file_extension} is not allowed"
                    ),
                )

        # Prefix the stored filename with a fresh uuid; the original name is
        # kept for the database record.
        file_id = str(uuid.uuid4())
        name = filename
        filename = f"{file_id}_{filename}"
        tags = {
            "OpenWebUI-User-Email": user.email,
            "OpenWebUI-User-Id": user.id,
            "OpenWebUI-User-Name": user.name,
            "OpenWebUI-File-Id": file_id,
        }
        contents, file_path = Storage.upload_file(file.file, filename, tags)

        file_item = Files.insert_new_file(
            user.id,
            FileForm(
                **{
                    "id": file_id,
                    "filename": name,
                    "path": file_path,
                    "meta": {
                        "name": name,
                        "content_type": file.content_type,
                        "size": len(contents),
                        "data": file_metadata,
                    },
                }
            ),
        )

        if process:
            # Processing is best-effort: a failure is reported on the returned
            # item instead of failing the whole upload.
            try:
                if file.content_type:
                    if file.content_type.startswith("audio/") or file.content_type in {
                        "video/webm"
                    }:
                        # Transcribe first, then process the resulting text.
                        file_path = Storage.get_file(file_path)
                        result = transcribe(request, file_path)

                        process_file(
                            request,
                            ProcessFileForm(
                                file_id=file_id, content=result.get("text", "")
                            ),
                            user=user,
                        )
                    elif file.content_type not in [
                        "image/png",
                        "image/jpeg",
                        "image/gif",
                        "video/mp4",
                        "video/ogg",
                        "video/quicktime",
                    ]:
                        process_file(
                            request, ProcessFileForm(file_id=file_id), user=user
                        )
                else:
                    log.info(
                        f"File type {file.content_type} is not provided, but trying to process anyway"
                    )
                    process_file(request, ProcessFileForm(file_id=file_id), user=user)

                # Reload so the response reflects whatever processing stored.
                file_item = Files.get_file_by_id(id=file_id)
            except Exception as e:
                log.exception(e)
                log.error(f"Error processing file: {file_item.id}")
                file_item = FileModelResponse(
                    **{
                        **file_item.model_dump(),
                        "error": str(e.detail) if hasattr(e, "detail") else str(e),
                    }
                )

        if file_item:
            return file_item
        else:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT("Error uploading file"),
            )

    except HTTPException:
        # Errors raised on purpose above already carry the intended status and
        # detail; re-raise them untouched instead of wrapping the exception
        # object inside a second error message.
        raise
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
############################
# List Files
############################
2024-10-21 14:38:26 +08:00
@router.get("/", response_model=list[FileModelResponse])
async def list_files(user=Depends(get_verified_user), content: bool = Query(True)):
    """List files visible to the caller.

    Admins get every file; other users get only the files they own.

    Args:
        user: Verified user making the request.
        content: When false, the "content" entry is removed from each file's
            ``data`` before returning.

    Returns:
        The list of files.
    """
    if user.role == "admin":
        files = Files.get_files()
    else:
        files = Files.get_files_by_user_id(user.id)

    if not content:
        for file in files:
            # NOTE(review): guards against ``data`` being None so one such
            # record cannot fail the whole listing - confirm that the file
            # model allows a missing ``data``.
            if file.data and "content" in file.data:
                del file.data["content"]

    return files
2025-04-08 12:42:37 +08:00
############################
# Search Files
############################
@router.get("/search", response_model=list[FileModelResponse])
async def search_files(
    filename: str = Query(
        ...,
        description="Filename pattern to search for. Supports wildcards such as '*.txt'",
    ),
    content: bool = Query(True),
    user=Depends(get_verified_user),
):
    """
    Search for files by filename with support for wildcard patterns.

    Matching is case-insensitive: both the stored filename and the pattern are
    lower-cased before being passed to ``fnmatch``.

    Args:
        filename: Wildcard pattern to match against each file's filename.
        content: When false, the "content" entry is removed from each matching
            file's ``data`` before returning.
        user: Verified user making the request; admins search all files,
            other users only their own.

    Returns:
        The list of matching files.

    Raises:
        HTTPException: 404 when no file matches the pattern.
    """
    # Get files according to user role
    if user.role == "admin":
        files = Files.get_files()
    else:
        files = Files.get_files_by_user_id(user.id)

    # Get matching files (lower-case the pattern once, not per file)
    pattern = filename.lower()
    matching_files = [file for file in files if fnmatch(file.filename.lower(), pattern)]

    if not matching_files:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No files found matching the pattern.",
        )

    if not content:
        for file in matching_files:
            # NOTE(review): guards against ``data`` being None so one such
            # record cannot fail the whole search - confirm that the file
            # model allows a missing ``data``.
            if file.data and "content" in file.data:
                del file.data["content"]

    return matching_files
2024-06-19 06:20:04 +08:00
############################
# Delete All Files
############################
@router.delete("/all")
async def delete_all_files(user=Depends(get_admin_user)):
    """Delete every file record and then every stored file (admin only).

    Returns:
        A confirmation message on success.

    Raises:
        HTTPException: 400 when the database deletion reports failure or when
            removing the stored files raises.
    """
    records_removed = Files.delete_all_files()
    if not records_removed:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
        )

    # Storage is only touched once the database records are gone.
    try:
        Storage.delete_all_files()
    except Exception as e:
        log.exception(e)
        log.error("Error deleting files")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
        )

    return {"message": "All files deleted successfully"}
2024-06-19 02:36:55 +08:00
############################
# Get File By Id
############################
@router.get("/{id}", response_model=Optional[FileModel])
async def get_file_by_id(id: str, user=Depends(get_verified_user)):
    """Return the file record with the given id.

    Access is granted to the file's owner, to admins, and to users who have
    "read" access through a knowledge base.

    Raises:
        HTTPException: 404 when the file does not exist or the caller may not
            read it (the two cases are deliberately indistinguishable).
    """
    record = Files.get_file_by_id(id)
    if not record:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    # Checks run in the same order as before: owner, admin, knowledge base.
    if (
        record.user_id != user.id
        and user.role != "admin"
        and not has_access_to_file(id, "read", user)
    ):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    return record
2024-10-04 15:23:14 +08:00
############################
# Get File Data Content By Id
############################
@router.get("/{id}/data/content")
async def get_file_data_content_by_id(id: str, user=Depends(get_verified_user)):
    """Return the "content" entry of a file's ``data``.

    Access is granted to the file's owner, to admins, and to users who have
    "read" access through a knowledge base.

    Returns:
        ``{"content": <text>}``, with an empty string when ``data`` has no
        "content" entry.

    Raises:
        HTTPException: 404 when the file does not exist or the caller may not
            read it.
    """
    record = Files.get_file_by_id(id)
    if not record:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    # Checks run in the same order as before: owner, admin, knowledge base.
    if (
        record.user_id != user.id
        and user.role != "admin"
        and not has_access_to_file(id, "read", user)
    ):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    text = record.data.get("content", "")
    return {"content": text}
############################
# Update File Data Content By Id
############################
class ContentForm(BaseModel):
    """Request body for the file content update endpoint."""

    # Replacement text handed to process_file for the target file.
    content: str
@router.post("/{id}/data/content/update")
async def update_file_data_content_by_id(
    request: Request, id: str, form_data: ContentForm, user=Depends(get_verified_user)
):
    """Re-process a file with new content and return the resulting content.

    Access is granted to the file's owner, to admins, and to users who have
    "write" access through a knowledge base.

    Returns:
        ``{"content": <text>}`` read from the file's ``data`` after processing.
        If processing raises, the error is logged and the content of the
        record loaded before processing is returned instead.

    Raises:
        HTTPException: 404 when the file does not exist or the caller may not
            write to it.
    """
    record = Files.get_file_by_id(id)
    if not record:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    # Checks run in the same order as before: owner, admin, knowledge base.
    if (
        record.user_id != user.id
        and user.role != "admin"
        and not has_access_to_file(id, "write", user)
    ):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    try:
        update_form = ProcessFileForm(file_id=id, content=form_data.content)
        process_file(request, update_form, user=user)
        # Reload so the response reflects what processing stored.
        record = Files.get_file_by_id(id=id)
    except Exception as e:
        # Best-effort: fall through and answer with the previously loaded record.
        log.exception(e)
        log.error(f"Error processing file: {record.id}")

    return {"content": record.data.get("content", "")}
2024-06-19 05:33:44 +08:00
############################
# Get File Content By Id
############################
@router.get("/{id}/content")
async def get_file_content_by_id(
    id: str, user=Depends(get_verified_user), attachment: bool = Query(False)
):
    """Serve the stored file for ``id``.

    Access is granted to the file's owner, to admins, and to users who have
    "read" access through a knowledge base.

    Content-Disposition rules:
      * ``attachment=True``: always served as an attachment.
      * otherwise PDFs (by content type or ``.pdf`` suffix) are served inline
        with media type ``application/pdf``;
      * ``text/plain`` gets no Content-Disposition header;
      * everything else is served as an attachment.

    Args:
        id: Id of the file to serve.
        user: Verified user making the request.
        attachment: Force download regardless of content type.

    Returns:
        A ``FileResponse`` for the file obtained from storage.

    Raises:
        HTTPException: 404 when the file record does not exist, the caller may
            not read it, or the path returned by storage is not a file; 400
            when retrieving the file fails for any other reason.
    """
    file = Files.get_file_by_id(id)

    if not file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    if (
        file.user_id == user.id
        or user.role == "admin"
        or has_access_to_file(id, "read", user)
    ):
        try:
            file_path = Storage.get_file(file.path)
            file_path = Path(file_path)

            # Check if the file already exists in the cache
            if file_path.is_file():
                content_type = file.meta.get("content_type")
                # Handle Unicode filenames
                filename = file.meta.get("name", file.filename)
                encoded_filename = quote(filename)  # RFC5987 encoding

                headers = {}

                if attachment:
                    headers["Content-Disposition"] = (
                        f"attachment; filename*=UTF-8''{encoded_filename}"
                    )
                else:
                    if content_type == "application/pdf" or filename.lower().endswith(
                        ".pdf"
                    ):
                        headers["Content-Disposition"] = (
                            f"inline; filename*=UTF-8''{encoded_filename}"
                        )
                        content_type = "application/pdf"
                    elif content_type != "text/plain":
                        headers["Content-Disposition"] = (
                            f"attachment; filename*=UTF-8''{encoded_filename}"
                        )

                return FileResponse(file_path, headers=headers, media_type=content_type)

            else:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=ERROR_MESSAGES.NOT_FOUND,
                )
        except HTTPException:
            # Let the 404 raised above through; the generic handler below would
            # otherwise turn it into a 400.
            raise
        except Exception as e:
            log.exception(e)
            log.error("Error getting file content")
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
            )
    else:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )
@router.get("/{id}/content/html")
async def get_html_file_content_by_id(id: str, user=Depends(get_verified_user)):
    """Serve a stored file, but only when the file was uploaded by an admin.

    On top of the uploader check, the caller must be the file's owner, an
    admin, or have "read" access through a knowledge base.

    Returns:
        A ``FileResponse`` for the file obtained from storage.

    Raises:
        HTTPException: 404 when the file record does not exist, its uploader
            is not an admin (or cannot be found), the caller may not read it,
            or the path returned by storage is not a file; 400 when retrieving
            the file fails for any other reason.
    """
    file = Files.get_file_by_id(id)

    if not file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    file_user = Users.get_user_by_id(file.user_id)
    # NOTE(review): a missing uploader is treated like a non-admin uploader so
    # the request ends in a 404 instead of an AttributeError - presumably the
    # lookup returns None for an unknown user; confirm.
    if not file_user or file_user.role != "admin":
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    if (
        file.user_id == user.id
        or user.role == "admin"
        or has_access_to_file(id, "read", user)
    ):
        try:
            file_path = Storage.get_file(file.path)
            file_path = Path(file_path)

            # Check if the file already exists in the cache
            if file_path.is_file():
                log.info(f"file_path: {file_path}")
                return FileResponse(file_path)
            else:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=ERROR_MESSAGES.NOT_FOUND,
                )
        except HTTPException:
            # Let the 404 raised above through; the generic handler below would
            # otherwise turn it into a 400.
            raise
        except Exception as e:
            log.exception(e)
            log.error("Error getting file content")
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
            )
    else:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )
@router.get("/{id}/content/{file_name}")
async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
    """Serve a file as an attachment; the trailing ``file_name`` segment is ignored.

    This function shares its name with the ``/{id}/content`` handler defined
    earlier in this module; the name is kept so anything importing it still
    resolves to the same object.

    Access is granted to the file's owner, to admins, and to users who have
    "read" access through a knowledge base.

    Returns:
        A ``FileResponse`` when the record has a path and storage yields a
        file; otherwise, when the record has no path, a plain-text
        ``StreamingResponse`` with the file's extracted content.

    Raises:
        HTTPException: 404 when the file record does not exist, the caller may
            not read it, or the record has a path that storage cannot resolve
            to a file.
    """
    file = Files.get_file_by_id(id)

    if not file:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    if (
        file.user_id == user.id
        or user.role == "admin"
        or has_access_to_file(id, "read", user)
    ):
        file_path = file.path

        # Handle Unicode filenames
        filename = file.meta.get("name", file.filename)
        encoded_filename = quote(filename)  # RFC5987 encoding
        headers = {
            "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"
        }

        if file_path:
            file_path = Storage.get_file(file_path)
            file_path = Path(file_path)

            # Check if the file already exists in the cache
            if file_path.is_file():
                return FileResponse(file_path, headers=headers)
            else:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=ERROR_MESSAGES.NOT_FOUND,
                )
        else:
            # The record has no stored path: return the extracted text as
            # plain text if possible.
            # NOTE(review): this used to read ``file.content``; every other
            # handler in this module reads the text from ``file.data``, so the
            # fallback now does the same - confirm against the file model.
            file_content = (file.data or {}).get("content", "")

            # Create a generator that encodes the file content
            def generator():
                yield file_content.encode("utf-8")

            return StreamingResponse(
                generator(),
                media_type="text/plain",
                headers=headers,
            )
    else:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )
2024-06-19 02:36:55 +08:00
############################
# Delete File By Id
############################
@router.delete("/{id}")
async def delete_file_by_id(id: str, user=Depends(get_verified_user)):
    """Delete a file record and then its stored file.

    Access is granted to the file's owner, to admins, and to users who have
    "write" access through a knowledge base.

    Returns:
        A confirmation message on success.

    Raises:
        HTTPException: 404 when the file does not exist or the caller may not
            delete it; 400 when the database deletion reports failure or when
            removing the stored file raises.
    """
    target = Files.get_file_by_id(id)
    if not target:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    # Checks run in the same order as before: owner, admin, knowledge base.
    if (
        target.user_id != user.id
        and user.role != "admin"
        and not has_access_to_file(id, "write", user)
    ):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    # We should add Chroma cleanup here

    record_removed = Files.delete_file_by_id(id)
    if not record_removed:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT("Error deleting file"),
        )

    # The stored file is only removed once the database record is gone.
    try:
        Storage.delete_file(target.path)
    except Exception as e:
        log.exception(e)
        log.error("Error deleting files")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
        )

    return {"message": "File deleted successfully"}