| 
									
										
										
										
											2024-08-28 06:10:27 +08:00
										 |  |  | import json | 
					
						
							|  |  |  | import logging | 
					
						
							|  |  |  | import mimetypes | 
					
						
							|  |  |  | import os | 
					
						
							|  |  |  | import shutil | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-28 06:10:27 +08:00
										 |  |  | import uuid | 
					
						
							| 
									
										
										
										
											2024-06-08 12:18:04 +08:00
										 |  |  | from datetime import datetime | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | from pathlib import Path | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  | from typing import Iterator, List, Optional, Sequence, Union | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | from fastapi import ( | 
					
						
							|  |  |  |     Depends, | 
					
						
							|  |  |  |     FastAPI, | 
					
						
							|  |  |  |     File, | 
					
						
							|  |  |  |     Form, | 
					
						
							|  |  |  |     HTTPException, | 
					
						
							|  |  |  |     UploadFile, | 
					
						
							|  |  |  |     Request, | 
					
						
							|  |  |  |     status, | 
					
						
							|  |  |  |     APIRouter, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-09-10 09:27:50 +08:00
										 |  |  | from fastapi.middleware.cors import CORSMiddleware | 
					
						
							| 
									
										
										
										
											2025-02-14 15:05:10 +08:00
										 |  |  | from fastapi.concurrency import run_in_threadpool | 
					
						
							| 
									
										
										
										
											2024-09-10 09:27:50 +08:00
										 |  |  | from pydantic import BaseModel | 
					
						
							| 
									
										
										
										
											2024-10-26 12:46:14 +08:00
										 |  |  | import tiktoken | 
					
						
							| 
									
										
										
										
											2024-09-10 09:27:50 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-13 18:02:02 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter | 
					
						
							|  |  |  | from langchain_core.documents import Document | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  | from open_webui.models.files import FileModel, Files | 
					
						
							| 
									
										
										
										
											2024-12-10 16:54:13 +08:00
										 |  |  | from open_webui.models.knowledge import Knowledges | 
					
						
							| 
									
										
										
										
											2024-10-21 14:45:15 +08:00
										 |  |  | from open_webui.storage.provider import Storage | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from open_webui.retrieval.vector.connector import VECTOR_DB_CLIENT | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Document loaders | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | from open_webui.retrieval.loaders.main import Loader | 
					
						
							|  |  |  | from open_webui.retrieval.loaders.youtube import YoutubeLoader | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Web search engines | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | from open_webui.retrieval.web.main import SearchResult | 
					
						
							|  |  |  | from open_webui.retrieval.web.utils import get_web_loader | 
					
						
							|  |  |  | from open_webui.retrieval.web.brave import search_brave | 
					
						
							|  |  |  | from open_webui.retrieval.web.kagi import search_kagi | 
					
						
							|  |  |  | from open_webui.retrieval.web.mojeek import search_mojeek | 
					
						
							| 
									
										
										
										
											2025-02-10 16:44:47 +08:00
										 |  |  | from open_webui.retrieval.web.bocha import search_bocha | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | from open_webui.retrieval.web.duckduckgo import search_duckduckgo | 
					
						
							|  |  |  | from open_webui.retrieval.web.google_pse import search_google_pse | 
					
						
							|  |  |  | from open_webui.retrieval.web.jina_search import search_jina | 
					
						
							|  |  |  | from open_webui.retrieval.web.searchapi import search_searchapi | 
					
						
							| 
									
										
										
										
											2025-02-14 12:24:58 +08:00
										 |  |  | from open_webui.retrieval.web.serpapi import search_serpapi | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | from open_webui.retrieval.web.searxng import search_searxng | 
					
						
							|  |  |  | from open_webui.retrieval.web.serper import search_serper | 
					
						
							|  |  |  | from open_webui.retrieval.web.serply import search_serply | 
					
						
							|  |  |  | from open_webui.retrieval.web.serpstack import search_serpstack | 
					
						
							|  |  |  | from open_webui.retrieval.web.tavily import search_tavily | 
					
						
							|  |  |  | from open_webui.retrieval.web.bing import search_bing | 
					
						
							| 
									
										
										
										
											2025-02-05 02:13:05 +08:00
										 |  |  | from open_webui.retrieval.web.exa import search_exa | 
					
						
							| 
									
										
										
										
											2025-02-27 16:12:41 +08:00
										 |  |  | from open_webui.retrieval.web.perplexity import search_perplexity | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | from open_webui.retrieval.utils import ( | 
					
						
							| 
									
										
										
										
											2024-08-28 06:10:27 +08:00
										 |  |  |     get_embedding_function, | 
					
						
							|  |  |  |     get_model_path, | 
					
						
							|  |  |  |     query_collection, | 
					
						
							|  |  |  |     query_collection_with_hybrid_search, | 
					
						
							|  |  |  |     query_doc, | 
					
						
							|  |  |  |     query_doc_with_hybrid_search, | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | from open_webui.utils.misc import ( | 
					
						
							|  |  |  |     calculate_sha256_string, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | from open_webui.utils.auth import get_admin_user, get_verified_user | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-04 22:54:48 +08:00
										 |  |  | from open_webui.config import ( | 
					
						
							| 
									
										
										
										
											2024-08-28 06:10:27 +08:00
										 |  |  |     ENV, | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  |     RAG_EMBEDDING_MODEL_AUTO_UPDATE, | 
					
						
							| 
									
										
										
										
											2024-04-23 02:27:43 +08:00
										 |  |  |     RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE, | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  |     RAG_RERANKING_MODEL_AUTO_UPDATE, | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  |     RAG_RERANKING_MODEL_TRUST_REMOTE_CODE, | 
					
						
							| 
									
										
										
										
											2024-08-28 06:10:27 +08:00
										 |  |  |     UPLOAD_DIR, | 
					
						
							| 
									
										
										
										
											2024-10-28 17:33:52 +08:00
										 |  |  |     DEFAULT_LOCALE, | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-11-03 17:08:04 +08:00
										 |  |  | from open_webui.env import ( | 
					
						
							|  |  |  |     SRC_LOG_LEVELS, | 
					
						
							|  |  |  |     DEVICE_TYPE, | 
					
						
							|  |  |  |     DOCKER, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | from open_webui.constants import ERROR_MESSAGES | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-21 07:11:36 +08:00
# Module-level logger named after this module; its level is taken from the
# "RAG" entry of SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | ########################################## | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Utility functions | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ########################################## | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  | def get_ef( | 
					
						
							|  |  |  |     engine: str, | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  |     embedding_model: str, | 
					
						
							| 
									
										
										
										
											2024-09-18 04:58:06 +08:00
										 |  |  |     auto_update: bool = False, | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |     ef = None | 
					
						
							|  |  |  |     if embedding_model and engine == "": | 
					
						
							| 
									
										
										
										
											2024-10-13 15:21:06 +08:00
										 |  |  |         from sentence_transformers import SentenceTransformer | 
					
						
							| 
									
										
										
										
											2024-07-01 08:13:56 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-03 17:08:04 +08:00
										 |  |  |         try: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |             ef = SentenceTransformer( | 
					
						
							| 
									
										
										
										
											2024-11-03 17:08:04 +08:00
										 |  |  |                 get_model_path(embedding_model, auto_update), | 
					
						
							|  |  |  |                 device=DEVICE_TYPE, | 
					
						
							|  |  |  |                 trust_remote_code=RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         except Exception as e: | 
					
						
							|  |  |  |             log.debug(f"Error loading SentenceTransformer: {e}") | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |     return ef | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | def get_rf( | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  |     reranking_model: str, | 
					
						
							| 
									
										
										
										
											2024-09-18 04:58:06 +08:00
										 |  |  |     auto_update: bool = False, | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |     rf = None | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  |     if reranking_model: | 
					
						
							| 
									
										
										
										
											2024-09-16 18:36:43 +08:00
										 |  |  |         if any(model in reranking_model for model in ["jinaai/jina-colbert-v2"]): | 
					
						
							| 
									
										
										
										
											2024-09-18 05:07:04 +08:00
										 |  |  |             try: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 from open_webui.retrieval.models.colbert import ColBERT | 
					
						
							| 
									
										
										
										
											2024-10-03 05:18:42 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |                 rf = ColBERT( | 
					
						
							| 
									
										
										
										
											2024-09-30 05:20:37 +08:00
										 |  |  |                     get_model_path(reranking_model, auto_update), | 
					
						
							|  |  |  |                     env="docker" if DOCKER else None, | 
					
						
							| 
									
										
										
										
											2024-09-20 00:40:23 +08:00
										 |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-18 05:13:51 +08:00
										 |  |  |             except Exception as e: | 
					
						
							|  |  |  |                 log.error(f"ColBERT: {e}") | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |                 raise Exception(ERROR_MESSAGES.DEFAULT(e)) | 
					
						
							| 
									
										
										
										
											2024-09-16 17:46:39 +08:00
										 |  |  |         else: | 
					
						
							|  |  |  |             import sentence_transformers | 
					
						
							| 
									
										
										
										
											2024-07-01 08:13:56 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-16 17:46:39 +08:00
										 |  |  |             try: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |                 rf = sentence_transformers.CrossEncoder( | 
					
						
							| 
									
										
										
										
											2024-09-18 04:58:06 +08:00
										 |  |  |                     get_model_path(reranking_model, auto_update), | 
					
						
							| 
									
										
										
										
											2024-09-16 17:46:39 +08:00
										 |  |  |                     device=DEVICE_TYPE, | 
					
						
							|  |  |  |                     trust_remote_code=RAG_RERANKING_MODEL_TRUST_REMOTE_CODE, | 
					
						
							|  |  |  |                 ) | 
					
						
							|  |  |  |             except: | 
					
						
							|  |  |  |                 log.error("CrossEncoder error") | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |                 raise Exception(ERROR_MESSAGES.DEFAULT("CrossEncoder error")) | 
					
						
							|  |  |  |     return rf | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | ########################################## | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # API routes | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | ########################################## | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
# Router on which the handlers below are registered through @router decorators.
router = APIRouter()
					
						
							| 
									
										
										
										
											2024-01-07 14:07:20 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-07 15:40:51 +08:00
# Base request model carrying an optional collection name; ProcessUrlForm and
# SearchForm extend it.
class CollectionNameForm(BaseModel):
    # May be omitted by the client, in which case it is None.
    collection_name: Optional[str] = None
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:29:08 +08:00
# CollectionNameForm plus a required URL string.
class ProcessUrlForm(CollectionNameForm):
    url: str
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-26 14:47:08 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
# CollectionNameForm plus a required query string.
class SearchForm(CollectionNameForm):
    query: str
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.get("/") | 
					
						
							|  |  |  | async def get_status(request: Request): | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         "chunk_size": request.app.state.config.CHUNK_SIZE, | 
					
						
							|  |  |  |         "chunk_overlap": request.app.state.config.CHUNK_OVERLAP, | 
					
						
							|  |  |  |         "template": request.app.state.config.RAG_TEMPLATE, | 
					
						
							|  |  |  |         "embedding_engine": request.app.state.config.RAG_EMBEDDING_ENGINE, | 
					
						
							|  |  |  |         "embedding_model": request.app.state.config.RAG_EMBEDDING_MODEL, | 
					
						
							|  |  |  |         "reranking_model": request.app.state.config.RAG_RERANKING_MODEL, | 
					
						
							|  |  |  |         "embedding_batch_size": request.app.state.config.RAG_EMBEDDING_BATCH_SIZE, | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.get("/embedding") | 
					
						
							|  |  |  | async def get_embedding_config(request: Request, user=Depends(get_admin_user)): | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         "embedding_engine": request.app.state.config.RAG_EMBEDDING_ENGINE, | 
					
						
							|  |  |  |         "embedding_model": request.app.state.config.RAG_EMBEDDING_MODEL, | 
					
						
							|  |  |  |         "embedding_batch_size": request.app.state.config.RAG_EMBEDDING_BATCH_SIZE, | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  |         "openai_config": { | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |             "url": request.app.state.config.RAG_OPENAI_API_BASE_URL, | 
					
						
							|  |  |  |             "key": request.app.state.config.RAG_OPENAI_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
										 |  |  |         "ollama_config": { | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |             "url": request.app.state.config.RAG_OLLAMA_BASE_URL, | 
					
						
							|  |  |  |             "key": request.app.state.config.RAG_OLLAMA_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.get("/reranking") | 
					
						
							|  |  |  | async def get_reraanking_config(request: Request, user=Depends(get_admin_user)): | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         "reranking_model": request.app.state.config.RAG_RERANKING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
# OpenAI connection settings; update_embedding_config stores them as
# RAG_OPENAI_API_BASE_URL and RAG_OPENAI_API_KEY.
class OpenAIConfigForm(BaseModel):
    url: str
    key: str
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
# Ollama connection settings; update_embedding_config stores them as
# RAG_OLLAMA_BASE_URL and RAG_OLLAMA_API_KEY.
class OllamaConfigForm(BaseModel):
    url: str
    key: str
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
# Request body accepted by update_embedding_config (POST /embedding/update).
class EmbeddingModelUpdateForm(BaseModel):
    # Connection settings; the handler applies each one only when it is not
    # None and the selected engine is "ollama" or "openai".
    openai_config: Optional[OpenAIConfigForm] = None
    ollama_config: Optional[OllamaConfigForm] = None
    embedding_engine: str
    embedding_model: str
    # Defaults to 1; the handler stores it only for the "ollama"/"openai" engines.
    embedding_batch_size: Optional[int] = 1
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/embedding/update") | 
					
						
							| 
									
										
										
										
											2024-04-15 06:31:40 +08:00
										 |  |  | async def update_embedding_config( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     request: Request, form_data: EmbeddingModelUpdateForm, user=Depends(get_admin_user) | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-04-05 02:07:42 +08:00
										 |  |  |     log.info( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         f"Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} to {form_data.embedding_model}" | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-04-05 01:01:23 +08:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.RAG_EMBEDDING_ENGINE = form_data.embedding_engine | 
					
						
							|  |  |  |         request.app.state.config.RAG_EMBEDDING_MODEL = form_data.embedding_model | 
					
						
							| 
									
										
										
										
											2024-04-15 06:31:40 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.RAG_EMBEDDING_ENGINE in ["ollama", "openai"]: | 
					
						
							| 
									
										
										
										
											2024-06-02 22:34:31 +08:00
										 |  |  |             if form_data.openai_config is not None: | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 request.app.state.config.RAG_OPENAI_API_BASE_URL = ( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                     form_data.openai_config.url | 
					
						
							|  |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 request.app.state.config.RAG_OPENAI_API_KEY = ( | 
					
						
							|  |  |  |                     form_data.openai_config.key | 
					
						
							|  |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |             if form_data.ollama_config is not None: | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 request.app.state.config.RAG_OLLAMA_BASE_URL = ( | 
					
						
							|  |  |  |                     form_data.ollama_config.url | 
					
						
							|  |  |  |                 ) | 
					
						
							|  |  |  |                 request.app.state.config.RAG_OLLAMA_API_KEY = ( | 
					
						
							|  |  |  |                     form_data.ollama_config.key | 
					
						
							|  |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             request.app.state.config.RAG_EMBEDDING_BATCH_SIZE = ( | 
					
						
							|  |  |  |                 form_data.embedding_batch_size | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-04-23 07:36:46 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |         request.app.state.ef = get_ef( | 
					
						
							|  |  |  |             request.app.state.config.RAG_EMBEDDING_ENGINE, | 
					
						
							|  |  |  |             request.app.state.config.RAG_EMBEDDING_MODEL, | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-04-05 01:01:23 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.EMBEDDING_FUNCTION = get_embedding_function( | 
					
						
							|  |  |  |             request.app.state.config.RAG_EMBEDDING_ENGINE, | 
					
						
							|  |  |  |             request.app.state.config.RAG_EMBEDDING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |             request.app.state.ef, | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
										 |  |  |             ( | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 request.app.state.config.RAG_OPENAI_API_BASE_URL | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 if request.app.state.config.RAG_EMBEDDING_ENGINE == "openai" | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 else request.app.state.config.RAG_OLLAMA_BASE_URL | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
										 |  |  |             ), | 
					
						
							|  |  |  |             ( | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 request.app.state.config.RAG_OPENAI_API_KEY | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 if request.app.state.config.RAG_EMBEDDING_ENGINE == "openai" | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 else request.app.state.config.RAG_OLLAMA_API_KEY | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
										 |  |  |             ), | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             request.app.state.config.RAG_EMBEDDING_BATCH_SIZE, | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-10 15:59:05 +08:00
										 |  |  |         return { | 
					
						
							|  |  |  |             "status": True, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             "embedding_engine": request.app.state.config.RAG_EMBEDDING_ENGINE, | 
					
						
							|  |  |  |             "embedding_model": request.app.state.config.RAG_EMBEDDING_MODEL, | 
					
						
							|  |  |  |             "embedding_batch_size": request.app.state.config.RAG_EMBEDDING_BATCH_SIZE, | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  |             "openai_config": { | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 "url": request.app.state.config.RAG_OPENAI_API_BASE_URL, | 
					
						
							|  |  |  |                 "key": request.app.state.config.RAG_OPENAI_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  |             }, | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
										 |  |  |             "ollama_config": { | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 "url": request.app.state.config.RAG_OLLAMA_BASE_URL, | 
					
						
							|  |  |  |                 "key": request.app.state.config.RAG_OLLAMA_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
										 |  |  |             }, | 
					
						
							| 
									
										
										
										
											2024-04-10 15:59:05 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(f"Problem updating embedding model: {e}") | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  | class RerankingModelUpdateForm(BaseModel): | 
					
						
							|  |  |  |     reranking_model: str | 
					
						
							| 
									
										
										
										
											2024-04-23 07:36:46 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/reranking/update") | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  | async def update_reranking_config( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     request: Request, form_data: RerankingModelUpdateForm, user=Depends(get_admin_user) | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  | ): | 
					
						
							|  |  |  |     log.info( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         f"Updating reranking model: {request.app.state.config.RAG_RERANKING_MODEL} to {form_data.reranking_model}" | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.RAG_RERANKING_MODEL = form_data.reranking_model | 
					
						
							| 
									
										
										
										
											2024-04-23 07:36:46 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |         try: | 
					
						
							|  |  |  |             request.app.state.rf = get_rf( | 
					
						
							|  |  |  |                 request.app.state.config.RAG_RERANKING_MODEL, | 
					
						
							|  |  |  |                 True, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         except Exception as e: | 
					
						
							|  |  |  |             log.error(f"Error loading reranking model: {e}") | 
					
						
							|  |  |  |             request.app.state.config.ENABLE_RAG_HYBRID_SEARCH = False | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         return { | 
					
						
							|  |  |  |             "status": True, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             "reranking_model": request.app.state.config.RAG_RERANKING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(f"Problem updating reranking model: {e}") | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.get("/config") | 
					
						
							|  |  |  | async def get_rag_config(request: Request, user=Depends(get_admin_user)): | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         "pdf_extract_images": request.app.state.config.PDF_EXTRACT_IMAGES, | 
					
						
							| 
									
										
										
										
											2025-02-19 13:14:58 +08:00
										 |  |  |         "RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT, | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |         "BYPASS_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL, | 
					
						
							| 
									
										
										
										
											2024-12-27 03:41:58 +08:00
										 |  |  |         "enable_google_drive_integration": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION, | 
					
						
							| 
									
										
										
										
											2025-02-24 22:14:10 +08:00
										 |  |  |         "enable_onedrive_integration": request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION, | 
					
						
							| 
									
										
										
										
											2024-07-02 08:11:09 +08:00
										 |  |  |         "content_extraction": { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE, | 
					
						
							|  |  |  |             "tika_server_url": request.app.state.config.TIKA_SERVER_URL, | 
					
						
							| 
									
										
										
										
											2025-02-14 20:08:03 +08:00
										 |  |  |             "docling_server_url": request.app.state.config.DOCLING_SERVER_URL, | 
					
						
							| 
									
										
										
										
											2025-02-07 20:44:47 +08:00
										 |  |  |             "document_intelligence_config": { | 
					
						
							|  |  |  |                 "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, | 
					
						
							|  |  |  |                 "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, | 
					
						
							|  |  |  |             }, | 
					
						
							| 
									
										
										
										
											2024-07-02 02:10:59 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  |         "chunk": { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             "text_splitter": request.app.state.config.TEXT_SPLITTER, | 
					
						
							|  |  |  |             "chunk_size": request.app.state.config.CHUNK_SIZE, | 
					
						
							|  |  |  |             "chunk_overlap": request.app.state.config.CHUNK_OVERLAP, | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-10-13 19:24:13 +08:00
										 |  |  |         "file": { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             "max_size": request.app.state.config.FILE_MAX_SIZE, | 
					
						
							|  |  |  |             "max_count": request.app.state.config.FILE_MAX_COUNT, | 
					
						
							| 
									
										
										
										
											2024-10-13 19:24:13 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |         "youtube": { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             "language": request.app.state.config.YOUTUBE_LOADER_LANGUAGE, | 
					
						
							|  |  |  |             "translation": request.app.state.YOUTUBE_LOADER_TRANSLATION, | 
					
						
							|  |  |  |             "proxy_url": request.app.state.config.YOUTUBE_LOADER_PROXY_URL, | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |         "web": { | 
					
						
							| 
									
										
										
										
											2025-02-18 10:14:26 +08:00
										 |  |  |             "ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |             "BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |             "search": { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 "enabled": request.app.state.config.ENABLE_RAG_WEB_SEARCH, | 
					
						
							| 
									
										
										
										
											2024-12-19 10:04:56 +08:00
										 |  |  |                 "drive": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION, | 
					
						
							| 
									
										
										
										
											2025-02-24 22:14:10 +08:00
										 |  |  |                 "onedrive": request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 "engine": request.app.state.config.RAG_WEB_SEARCH_ENGINE, | 
					
						
							|  |  |  |                 "searxng_query_url": request.app.state.config.SEARXNG_QUERY_URL, | 
					
						
							|  |  |  |                 "google_pse_api_key": request.app.state.config.GOOGLE_PSE_API_KEY, | 
					
						
							|  |  |  |                 "google_pse_engine_id": request.app.state.config.GOOGLE_PSE_ENGINE_ID, | 
					
						
							|  |  |  |                 "brave_search_api_key": request.app.state.config.BRAVE_SEARCH_API_KEY, | 
					
						
							|  |  |  |                 "kagi_search_api_key": request.app.state.config.KAGI_SEARCH_API_KEY, | 
					
						
							|  |  |  |                 "mojeek_search_api_key": request.app.state.config.MOJEEK_SEARCH_API_KEY, | 
					
						
							| 
									
										
										
										
											2025-02-10 16:44:47 +08:00
										 |  |  |                 "bocha_search_api_key": request.app.state.config.BOCHA_SEARCH_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 "serpstack_api_key": request.app.state.config.SERPSTACK_API_KEY, | 
					
						
							|  |  |  |                 "serpstack_https": request.app.state.config.SERPSTACK_HTTPS, | 
					
						
							|  |  |  |                 "serper_api_key": request.app.state.config.SERPER_API_KEY, | 
					
						
							|  |  |  |                 "serply_api_key": request.app.state.config.SERPLY_API_KEY, | 
					
						
							|  |  |  |                 "tavily_api_key": request.app.state.config.TAVILY_API_KEY, | 
					
						
							|  |  |  |                 "searchapi_api_key": request.app.state.config.SEARCHAPI_API_KEY, | 
					
						
							| 
									
										
										
										
											2025-01-07 02:10:25 +08:00
										 |  |  |                 "searchapi_engine": request.app.state.config.SEARCHAPI_ENGINE, | 
					
						
							| 
									
										
										
										
											2025-02-14 12:24:58 +08:00
										 |  |  |                 "serpapi_api_key": request.app.state.config.SERPAPI_API_KEY, | 
					
						
							|  |  |  |                 "serpapi_engine": request.app.state.config.SERPAPI_ENGINE, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 "jina_api_key": request.app.state.config.JINA_API_KEY, | 
					
						
							|  |  |  |                 "bing_search_v7_endpoint": request.app.state.config.BING_SEARCH_V7_ENDPOINT, | 
					
						
							|  |  |  |                 "bing_search_v7_subscription_key": request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY, | 
					
						
							| 
									
										
										
										
											2025-02-05 02:13:05 +08:00
										 |  |  |                 "exa_api_key": request.app.state.config.EXA_API_KEY, | 
					
						
							| 
									
										
										
										
											2025-02-27 16:12:41 +08:00
										 |  |  |                 "perplexity_api_key": request.app.state.config.PERPLEXITY_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 "result_count": request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							| 
									
										
										
										
											2025-02-22 05:40:11 +08:00
										 |  |  |                 "trust_env": request.app.state.config.RAG_WEB_SEARCH_TRUST_ENV, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 "concurrent_requests": request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS, | 
					
						
							| 
									
										
										
										
											2025-02-06 00:14:40 +08:00
										 |  |  |                 "domain_filter_list": request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |             }, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-27 23:05:24 +08:00
										 |  |  | class FileConfig(BaseModel): | 
					
						
							|  |  |  |     max_size: Optional[int] = None | 
					
						
							|  |  |  |     max_count: Optional[int] = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-07 20:44:47 +08:00
										 |  |  | class DocumentIntelligenceConfigForm(BaseModel): | 
					
						
							|  |  |  |     endpoint: str | 
					
						
							|  |  |  |     key: str | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-07-02 08:11:09 +08:00
										 |  |  | class ContentExtractionConfig(BaseModel): | 
					
						
							| 
									
										
										
										
											2024-07-02 02:10:59 +08:00
										 |  |  |     engine: str = "" | 
					
						
							|  |  |  |     tika_server_url: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2025-02-14 20:08:03 +08:00
										 |  |  |     docling_server_url: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2025-02-07 20:44:47 +08:00
										 |  |  |     document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None | 
					
						
							| 
									
										
										
										
											2024-07-02 02:10:59 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  | class ChunkParamUpdateForm(BaseModel): | 
					
						
							| 
									
										
										
										
											2024-10-13 19:24:13 +08:00
										 |  |  |     text_splitter: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  |     chunk_size: int | 
					
						
							|  |  |  |     chunk_overlap: int | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  | class YoutubeLoaderConfig(BaseModel): | 
					
						
							| 
									
										
										
										
											2024-08-14 20:46:31 +08:00
										 |  |  |     language: list[str] | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |     translation: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-11-27 22:09:33 +08:00
										 |  |  |     proxy_url: str = "" | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  | class WebSearchConfig(BaseModel): | 
					
						
							| 
									
										
										
										
											2024-06-02 11:08:08 +08:00
										 |  |  |     enabled: bool | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     engine: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |     searxng_query_url: Optional[str] = None | 
					
						
							|  |  |  |     google_pse_api_key: Optional[str] = None | 
					
						
							|  |  |  |     google_pse_engine_id: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     brave_search_api_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-12-08 13:21:10 +08:00
										 |  |  |     kagi_search_api_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-10-29 22:45:38 +08:00
										 |  |  |     mojeek_search_api_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2025-02-10 16:44:47 +08:00
										 |  |  |     bocha_search_api_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |     serpstack_api_key: Optional[str] = None | 
					
						
							|  |  |  |     serpstack_https: Optional[bool] = None | 
					
						
							|  |  |  |     serper_api_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |     serply_api_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-06-14 23:14:11 +08:00
										 |  |  |     tavily_api_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-08-27 15:45:17 +08:00
										 |  |  |     searchapi_api_key: Optional[str] = None | 
					
						
							|  |  |  |     searchapi_engine: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2025-02-14 12:24:58 +08:00
										 |  |  |     serpapi_api_key: Optional[str] = None | 
					
						
							|  |  |  |     serpapi_engine: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-11-04 09:07:24 +08:00
										 |  |  |     jina_api_key: Optional[str] = None | 
					
						
							|  |  |  |     bing_search_v7_endpoint: Optional[str] = None | 
					
						
							|  |  |  |     bing_search_v7_subscription_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2025-02-05 02:13:05 +08:00
										 |  |  |     exa_api_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2025-02-27 16:12:41 +08:00
										 |  |  |     perplexity_api_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |     result_count: Optional[int] = None | 
					
						
							|  |  |  |     concurrent_requests: Optional[int] = None | 
					
						
							| 
									
										
										
										
											2025-02-14 15:15:09 +08:00
										 |  |  |     trust_env: Optional[bool] = None | 
					
						
							| 
									
										
										
										
											2025-02-06 00:14:40 +08:00
										 |  |  |     domain_filter_list: Optional[List[str]] = [] | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  | class WebConfig(BaseModel): | 
					
						
							|  |  |  |     search: WebSearchConfig | 
					
						
							| 
									
										
										
										
											2025-02-18 10:14:26 +08:00
										 |  |  |     ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION: Optional[bool] = None | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |     BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  | class ConfigUpdateForm(BaseModel): | 
					
						
							| 
									
										
										
										
											2025-02-19 13:14:58 +08:00
										 |  |  |     RAG_FULL_CONTEXT: Optional[bool] = None | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |     BYPASS_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  |     pdf_extract_images: Optional[bool] = None | 
					
						
							| 
									
										
										
										
											2024-12-19 10:04:56 +08:00
										 |  |  |     enable_google_drive_integration: Optional[bool] = None | 
					
						
							| 
									
										
										
										
											2025-02-24 22:14:10 +08:00
										 |  |  |     enable_onedrive_integration: Optional[bool] = None | 
					
						
							| 
									
										
										
										
											2024-08-27 23:05:24 +08:00
										 |  |  |     file: Optional[FileConfig] = None | 
					
						
							| 
									
										
										
										
											2024-07-02 08:11:09 +08:00
										 |  |  |     content_extraction: Optional[ContentExtractionConfig] = None | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  |     chunk: Optional[ChunkParamUpdateForm] = None | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |     youtube: Optional[YoutubeLoaderConfig] = None | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     web: Optional[WebConfig] = None | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/config/update") | 
					
						
							|  |  |  | async def update_rag_config( | 
					
						
							|  |  |  |     request: Request, form_data: ConfigUpdateForm, user=Depends(get_admin_user) | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     request.app.state.config.PDF_EXTRACT_IMAGES = ( | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  |         form_data.pdf_extract_images | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         if form_data.pdf_extract_images is not None | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         else request.app.state.config.PDF_EXTRACT_IMAGES | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-19 13:14:58 +08:00
										 |  |  |     request.app.state.config.RAG_FULL_CONTEXT = ( | 
					
						
							|  |  |  |         form_data.RAG_FULL_CONTEXT | 
					
						
							|  |  |  |         if form_data.RAG_FULL_CONTEXT is not None | 
					
						
							|  |  |  |         else request.app.state.config.RAG_FULL_CONTEXT | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |     request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = ( | 
					
						
							|  |  |  |         form_data.BYPASS_EMBEDDING_AND_RETRIEVAL | 
					
						
							|  |  |  |         if form_data.BYPASS_EMBEDDING_AND_RETRIEVAL is not None | 
					
						
							|  |  |  |         else request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-19 10:04:56 +08:00
										 |  |  |     request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = ( | 
					
						
							| 
									
										
										
										
											2024-12-20 12:16:24 +08:00
										 |  |  |         form_data.enable_google_drive_integration | 
					
						
							|  |  |  |         if form_data.enable_google_drive_integration is not None | 
					
						
							| 
									
										
										
										
											2024-12-19 10:04:56 +08:00
										 |  |  |         else request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION | 
					
						
							| 
									
										
										
										
											2024-12-19 02:25:57 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-24 22:14:10 +08:00
										 |  |  |     request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION = ( | 
					
						
							|  |  |  |         form_data.enable_onedrive_integration | 
					
						
							|  |  |  |         if form_data.enable_onedrive_integration is not None | 
					
						
							|  |  |  |         else request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-27 23:05:24 +08:00
										 |  |  |     if form_data.file is not None: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.FILE_MAX_SIZE = form_data.file.max_size | 
					
						
							|  |  |  |         request.app.state.config.FILE_MAX_COUNT = form_data.file.max_count | 
					
						
							| 
									
										
										
										
											2024-08-27 23:05:24 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-07-02 08:11:09 +08:00
										 |  |  |     if form_data.content_extraction is not None: | 
					
						
							| 
									
										
										
										
											2025-02-07 20:44:47 +08:00
										 |  |  |         log.info( | 
					
						
							|  |  |  |             f"Updating content extraction: {request.app.state.config.CONTENT_EXTRACTION_ENGINE} to {form_data.content_extraction.engine}" | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.CONTENT_EXTRACTION_ENGINE = ( | 
					
						
							|  |  |  |             form_data.content_extraction.engine | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         request.app.state.config.TIKA_SERVER_URL = ( | 
					
						
							|  |  |  |             form_data.content_extraction.tika_server_url | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-02-14 20:08:03 +08:00
										 |  |  |         request.app.state.config.DOCLING_SERVER_URL = ( | 
					
						
							|  |  |  |             form_data.content_extraction.docling_server_url | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-02-07 20:44:47 +08:00
										 |  |  |         if form_data.content_extraction.document_intelligence_config is not None: | 
					
						
							|  |  |  |             request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = ( | 
					
						
							|  |  |  |                 form_data.content_extraction.document_intelligence_config.endpoint | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |             request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = ( | 
					
						
							|  |  |  |                 form_data.content_extraction.document_intelligence_config.key | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-07-02 02:10:59 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     if form_data.chunk is not None: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter | 
					
						
							|  |  |  |         request.app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size | 
					
						
							|  |  |  |         request.app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     if form_data.youtube is not None: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.YOUTUBE_LOADER_LANGUAGE = form_data.youtube.language | 
					
						
							|  |  |  |         request.app.state.config.YOUTUBE_LOADER_PROXY_URL = form_data.youtube.proxy_url | 
					
						
							|  |  |  |         request.app.state.YOUTUBE_LOADER_TRANSLATION = form_data.youtube.translation | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     if form_data.web is not None: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = ( | 
					
						
							| 
									
										
										
										
											2024-11-17 15:46:12 +08:00
										 |  |  |             # Note: When UI "Bypass SSL verification for Websites"=True then ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION=False | 
					
						
							| 
									
										
										
										
											2025-02-18 10:14:26 +08:00
										 |  |  |             form_data.web.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.ENABLE_RAG_WEB_SEARCH = form_data.web.search.enabled | 
					
						
							|  |  |  |         request.app.state.config.RAG_WEB_SEARCH_ENGINE = form_data.web.search.engine | 
					
						
							| 
									
										
										
										
											2025-02-18 10:14:26 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |         request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = ( | 
					
						
							|  |  |  |             form_data.web.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL | 
					
						
							| 
									
										
										
										
											2025-02-18 10:14:26 +08:00
										 |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.SEARXNG_QUERY_URL = ( | 
					
						
							|  |  |  |             form_data.web.search.searxng_query_url | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         request.app.state.config.GOOGLE_PSE_API_KEY = ( | 
					
						
							|  |  |  |             form_data.web.search.google_pse_api_key | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         request.app.state.config.GOOGLE_PSE_ENGINE_ID = ( | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |             form_data.web.search.google_pse_engine_id | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.BRAVE_SEARCH_API_KEY = ( | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |             form_data.web.search.brave_search_api_key | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.KAGI_SEARCH_API_KEY = ( | 
					
						
							|  |  |  |             form_data.web.search.kagi_search_api_key | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         request.app.state.config.MOJEEK_SEARCH_API_KEY = ( | 
					
						
							| 
									
										
										
										
											2024-11-22 01:52:19 +08:00
										 |  |  |             form_data.web.search.mojeek_search_api_key | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-02-10 16:44:47 +08:00
										 |  |  |         request.app.state.config.BOCHA_SEARCH_API_KEY = ( | 
					
						
							|  |  |  |             form_data.web.search.bocha_search_api_key | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.SERPSTACK_API_KEY = ( | 
					
						
							|  |  |  |             form_data.web.search.serpstack_api_key | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         request.app.state.config.SERPSTACK_HTTPS = form_data.web.search.serpstack_https | 
					
						
							|  |  |  |         request.app.state.config.SERPER_API_KEY = form_data.web.search.serper_api_key | 
					
						
							|  |  |  |         request.app.state.config.SERPLY_API_KEY = form_data.web.search.serply_api_key | 
					
						
							|  |  |  |         request.app.state.config.TAVILY_API_KEY = form_data.web.search.tavily_api_key | 
					
						
							|  |  |  |         request.app.state.config.SEARCHAPI_API_KEY = ( | 
					
						
							|  |  |  |             form_data.web.search.searchapi_api_key | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         request.app.state.config.SEARCHAPI_ENGINE = ( | 
					
						
							|  |  |  |             form_data.web.search.searchapi_engine | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-14 14:54:45 +08:00
										 |  |  |         request.app.state.config.SERPAPI_API_KEY = form_data.web.search.serpapi_api_key | 
					
						
							|  |  |  |         request.app.state.config.SERPAPI_ENGINE = form_data.web.search.serpapi_engine | 
					
						
							| 
									
										
										
										
											2025-02-14 12:24:58 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.JINA_API_KEY = form_data.web.search.jina_api_key | 
					
						
							|  |  |  |         request.app.state.config.BING_SEARCH_V7_ENDPOINT = ( | 
					
						
							| 
									
										
										
										
											2024-11-04 09:07:24 +08:00
										 |  |  |             form_data.web.search.bing_search_v7_endpoint | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY = ( | 
					
						
							| 
									
										
										
										
											2024-11-04 09:07:24 +08:00
										 |  |  |             form_data.web.search.bing_search_v7_subscription_key | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-05 02:13:05 +08:00
										 |  |  |         request.app.state.config.EXA_API_KEY = form_data.web.search.exa_api_key | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-03-01 23:28:00 +08:00
										 |  |  |         request.app.state.config.PERPLEXITY_API_KEY = ( | 
					
						
							|  |  |  |             form_data.web.search.perplexity_api_key | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-02-27 16:12:41 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT = ( | 
					
						
							|  |  |  |             form_data.web.search.result_count | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = ( | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |             form_data.web.search.concurrent_requests | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-02-14 15:15:09 +08:00
										 |  |  |         request.app.state.config.RAG_WEB_SEARCH_TRUST_ENV = ( | 
					
						
							|  |  |  |             form_data.web.search.trust_env | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-02-06 00:14:40 +08:00
										 |  |  |         request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST = ( | 
					
						
							|  |  |  |             form_data.web.search.domain_filter_list | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         "pdf_extract_images": request.app.state.config.PDF_EXTRACT_IMAGES, | 
					
						
							| 
									
										
										
										
											2025-02-19 13:14:58 +08:00
										 |  |  |         "RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT, | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |         "BYPASS_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL, | 
					
						
							| 
									
										
										
										
											2024-08-27 21:51:40 +08:00
										 |  |  |         "file": { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             "max_size": request.app.state.config.FILE_MAX_SIZE, | 
					
						
							|  |  |  |             "max_count": request.app.state.config.FILE_MAX_COUNT, | 
					
						
							| 
									
										
										
										
											2024-08-27 21:51:40 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-07-02 08:11:09 +08:00
										 |  |  |         "content_extraction": { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE, | 
					
						
							|  |  |  |             "tika_server_url": request.app.state.config.TIKA_SERVER_URL, | 
					
						
							| 
									
										
										
										
											2025-02-14 20:08:03 +08:00
										 |  |  |             "docling_server_url": request.app.state.config.DOCLING_SERVER_URL, | 
					
						
							| 
									
										
										
										
											2025-02-07 20:44:47 +08:00
										 |  |  |             "document_intelligence_config": { | 
					
						
							|  |  |  |                 "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, | 
					
						
							|  |  |  |                 "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, | 
					
						
							|  |  |  |             }, | 
					
						
							| 
									
										
										
										
											2024-07-02 02:10:59 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  |         "chunk": { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             "text_splitter": request.app.state.config.TEXT_SPLITTER, | 
					
						
							|  |  |  |             "chunk_size": request.app.state.config.CHUNK_SIZE, | 
					
						
							|  |  |  |             "chunk_overlap": request.app.state.config.CHUNK_OVERLAP, | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |         "youtube": { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             "language": request.app.state.config.YOUTUBE_LOADER_LANGUAGE, | 
					
						
							|  |  |  |             "proxy_url": request.app.state.config.YOUTUBE_LOADER_PROXY_URL, | 
					
						
							|  |  |  |             "translation": request.app.state.YOUTUBE_LOADER_TRANSLATION, | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |         "web": { | 
					
						
							| 
									
										
										
										
											2025-02-18 10:14:26 +08:00
										 |  |  |             "ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |             "BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |             "search": { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 "enabled": request.app.state.config.ENABLE_RAG_WEB_SEARCH, | 
					
						
							|  |  |  |                 "engine": request.app.state.config.RAG_WEB_SEARCH_ENGINE, | 
					
						
							|  |  |  |                 "searxng_query_url": request.app.state.config.SEARXNG_QUERY_URL, | 
					
						
							|  |  |  |                 "google_pse_api_key": request.app.state.config.GOOGLE_PSE_API_KEY, | 
					
						
							|  |  |  |                 "google_pse_engine_id": request.app.state.config.GOOGLE_PSE_ENGINE_ID, | 
					
						
							|  |  |  |                 "brave_search_api_key": request.app.state.config.BRAVE_SEARCH_API_KEY, | 
					
						
							|  |  |  |                 "kagi_search_api_key": request.app.state.config.KAGI_SEARCH_API_KEY, | 
					
						
							|  |  |  |                 "mojeek_search_api_key": request.app.state.config.MOJEEK_SEARCH_API_KEY, | 
					
						
							| 
									
										
										
										
											2025-02-10 16:44:47 +08:00
										 |  |  |                 "bocha_search_api_key": request.app.state.config.BOCHA_SEARCH_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 "serpstack_api_key": request.app.state.config.SERPSTACK_API_KEY, | 
					
						
							|  |  |  |                 "serpstack_https": request.app.state.config.SERPSTACK_HTTPS, | 
					
						
							|  |  |  |                 "serper_api_key": request.app.state.config.SERPER_API_KEY, | 
					
						
							|  |  |  |                 "serply_api_key": request.app.state.config.SERPLY_API_KEY, | 
					
						
							|  |  |  |                 "serachapi_api_key": request.app.state.config.SEARCHAPI_API_KEY, | 
					
						
							|  |  |  |                 "searchapi_engine": request.app.state.config.SEARCHAPI_ENGINE, | 
					
						
							| 
									
										
										
										
											2025-02-14 12:24:58 +08:00
										 |  |  |                 "serpapi_api_key": request.app.state.config.SERPAPI_API_KEY, | 
					
						
							|  |  |  |                 "serpapi_engine": request.app.state.config.SERPAPI_ENGINE, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 "tavily_api_key": request.app.state.config.TAVILY_API_KEY, | 
					
						
							|  |  |  |                 "jina_api_key": request.app.state.config.JINA_API_KEY, | 
					
						
							|  |  |  |                 "bing_search_v7_endpoint": request.app.state.config.BING_SEARCH_V7_ENDPOINT, | 
					
						
							|  |  |  |                 "bing_search_v7_subscription_key": request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY, | 
					
						
							| 
									
										
										
										
											2025-02-05 02:13:05 +08:00
										 |  |  |                 "exa_api_key": request.app.state.config.EXA_API_KEY, | 
					
						
							| 
									
										
										
										
											2025-02-27 16:12:41 +08:00
										 |  |  |                 "perplexity_api_key": request.app.state.config.PERPLEXITY_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 "result_count": request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 "concurrent_requests": request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS, | 
					
						
							| 
									
										
										
										
											2025-02-14 15:15:09 +08:00
										 |  |  |                 "trust_env": request.app.state.config.RAG_WEB_SEARCH_TRUST_ENV, | 
					
						
							| 
									
										
										
										
											2025-02-06 00:14:40 +08:00
										 |  |  |                 "domain_filter_list": request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |             }, | 
					
						
							|  |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.get("/template") | 
					
						
							|  |  |  | async def get_rag_template(request: Request, user=Depends(get_verified_user)): | 
					
						
							| 
									
										
										
										
											2024-02-18 14:41:03 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         "template": request.app.state.config.RAG_TEMPLATE, | 
					
						
							| 
									
										
										
										
											2024-02-18 14:41:03 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.get("/query/settings") | 
					
						
							|  |  |  | async def get_query_settings(request: Request, user=Depends(get_admin_user)): | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         "template": request.app.state.config.RAG_TEMPLATE, | 
					
						
							|  |  |  |         "k": request.app.state.config.TOP_K, | 
					
						
							|  |  |  |         "r": request.app.state.config.RELEVANCE_THRESHOLD, | 
					
						
							|  |  |  |         "hybrid": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH, | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-02-18 14:41:03 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
# Request body for POST /query/settings/update.
# Every field is optional and defaults to None when the client omits it.
class QuerySettingsForm(BaseModel):
    # Stored as config.TOP_K by the update handler.
    k: Optional[int] = None
    # Stored as config.RELEVANCE_THRESHOLD by the update handler.
    r: Optional[float] = None
    # Stored as config.RAG_TEMPLATE by the update handler.
    template: Optional[str] = None
    # Stored as config.ENABLE_RAG_HYBRID_SEARCH by the update handler.
    hybrid: Optional[bool] = None
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/query/settings/update") | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  | async def update_query_settings( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     request: Request, form_data: QuerySettingsForm, user=Depends(get_admin_user) | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     request.app.state.config.RAG_TEMPLATE = form_data.template | 
					
						
							|  |  |  |     request.app.state.config.TOP_K = form_data.k if form_data.k else 4 | 
					
						
							|  |  |  |     request.app.state.config.RELEVANCE_THRESHOLD = form_data.r if form_data.r else 0.0 | 
					
						
							| 
									
										
										
										
											2024-10-13 19:24:13 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     request.app.state.config.ENABLE_RAG_HYBRID_SEARCH = ( | 
					
						
							| 
									
										
										
										
											2024-05-18 10:53:38 +08:00
										 |  |  |         form_data.hybrid if form_data.hybrid else False | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-08-02 21:36:17 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-26 06:31:21 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         "template": request.app.state.config.RAG_TEMPLATE, | 
					
						
							|  |  |  |         "k": request.app.state.config.TOP_K, | 
					
						
							|  |  |  |         "r": request.app.state.config.RELEVANCE_THRESHOLD, | 
					
						
							|  |  |  |         "hybrid": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH, | 
					
						
							| 
									
										
										
										
											2024-04-26 06:31:21 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | #################################### | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Document process and retrieval | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #################################### | 
					
						
							| 
									
										
										
										
											2024-02-02 05:35:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  | def save_docs_to_vector_db( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     request: Request, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  |     docs, | 
					
						
							|  |  |  |     collection_name, | 
					
						
							|  |  |  |     metadata: Optional[dict] = None, | 
					
						
							|  |  |  |     overwrite: bool = False, | 
					
						
							|  |  |  |     split: bool = True, | 
					
						
							| 
									
										
										
										
											2024-10-03 21:44:17 +08:00
										 |  |  |     add: bool = False, | 
					
						
							| 
									
										
										
										
											2025-02-05 16:07:45 +08:00
										 |  |  |     user=None, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | ) -> bool: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     def _get_docs_info(docs: list[Document]) -> str: | 
					
						
							|  |  |  |         docs_info = set() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Trying to select relevant metadata identifying the document. | 
					
						
							|  |  |  |         for doc in docs: | 
					
						
							|  |  |  |             metadata = getattr(doc, "metadata", {}) | 
					
						
							|  |  |  |             doc_name = metadata.get("name", "") | 
					
						
							|  |  |  |             if not doc_name: | 
					
						
							|  |  |  |                 doc_name = metadata.get("title", "") | 
					
						
							|  |  |  |             if not doc_name: | 
					
						
							|  |  |  |                 doc_name = metadata.get("source", "") | 
					
						
							|  |  |  |             if doc_name: | 
					
						
							|  |  |  |                 docs_info.add(doc_name) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return ", ".join(docs_info) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-03 17:08:04 +08:00
										 |  |  |     log.info( | 
					
						
							|  |  |  |         f"save_docs_to_vector_db: document {_get_docs_info(docs)} {collection_name}" | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-03 21:53:21 +08:00
										 |  |  |     # Check if entries with the same hash (metadata.hash) already exist | 
					
						
							|  |  |  |     if metadata and "hash" in metadata: | 
					
						
							| 
									
										
										
										
											2024-10-04 14:06:47 +08:00
										 |  |  |         result = VECTOR_DB_CLIENT.query( | 
					
						
							| 
									
										
										
										
											2024-10-03 21:53:21 +08:00
										 |  |  |             collection_name=collection_name, | 
					
						
							|  |  |  |             filter={"hash": metadata["hash"]}, | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-10-04 14:06:47 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-08 05:03:42 +08:00
										 |  |  |         if result is not None: | 
					
						
							| 
									
										
										
										
											2024-10-04 14:06:47 +08:00
										 |  |  |             existing_doc_ids = result.ids[0] | 
					
						
							|  |  |  |             if existing_doc_ids: | 
					
						
							|  |  |  |                 log.info(f"Document with hash {metadata['hash']} already exists") | 
					
						
							|  |  |  |                 raise ValueError(ERROR_MESSAGES.DUPLICATE_CONTENT) | 
					
						
							| 
									
										
										
										
											2024-10-03 21:53:21 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  |     if split: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.TEXT_SPLITTER in ["", "character"]: | 
					
						
							| 
									
										
										
										
											2024-10-13 17:07:50 +08:00
										 |  |  |             text_splitter = RecursiveCharacterTextSplitter( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 chunk_size=request.app.state.config.CHUNK_SIZE, | 
					
						
							|  |  |  |                 chunk_overlap=request.app.state.config.CHUNK_OVERLAP, | 
					
						
							| 
									
										
										
										
											2024-10-13 17:07:50 +08:00
										 |  |  |                 add_start_index=True, | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         elif request.app.state.config.TEXT_SPLITTER == "token": | 
					
						
							| 
									
										
										
										
											2024-10-26 13:23:21 +08:00
										 |  |  |             log.info( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}" | 
					
						
							| 
									
										
										
										
											2024-10-26 13:23:21 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             tiktoken.get_encoding(str(request.app.state.config.TIKTOKEN_ENCODING_NAME)) | 
					
						
							| 
									
										
										
										
											2024-10-13 17:07:50 +08:00
										 |  |  |             text_splitter = TokenTextSplitter( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 encoding_name=str(request.app.state.config.TIKTOKEN_ENCODING_NAME), | 
					
						
							|  |  |  |                 chunk_size=request.app.state.config.CHUNK_SIZE, | 
					
						
							|  |  |  |                 chunk_overlap=request.app.state.config.CHUNK_OVERLAP, | 
					
						
							| 
									
										
										
										
											2024-10-13 17:07:50 +08:00
										 |  |  |                 add_start_index=True, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise ValueError(ERROR_MESSAGES.DEFAULT("Invalid text splitter")) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  |         docs = text_splitter.split_documents(docs) | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  |     if len(docs) == 0: | 
					
						
							|  |  |  |         raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT) | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     texts = [doc.page_content for doc in docs] | 
					
						
							| 
									
										
										
										
											2024-10-13 18:25:11 +08:00
										 |  |  |     metadatas = [ | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |             **doc.metadata, | 
					
						
							|  |  |  |             **(metadata if metadata else {}), | 
					
						
							|  |  |  |             "embedding_config": json.dumps( | 
					
						
							|  |  |  |                 { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                     "engine": request.app.state.config.RAG_EMBEDDING_ENGINE, | 
					
						
							|  |  |  |                     "model": request.app.state.config.RAG_EMBEDDING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-10-13 18:25:11 +08:00
										 |  |  |                 } | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         for doc in docs | 
					
						
							|  |  |  |     ] | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # ChromaDB does not like datetime formats | 
					
						
							|  |  |  |     # for meta-data so convert them to string. | 
					
						
							|  |  |  |     for metadata in metadatas: | 
					
						
							|  |  |  |         for key, value in metadata.items(): | 
					
						
							| 
									
										
										
										
											2025-02-14 13:45:29 +08:00
										 |  |  |             if ( | 
					
						
							|  |  |  |                 isinstance(value, datetime) | 
					
						
							|  |  |  |                 or isinstance(value, list) | 
					
						
							|  |  |  |                 or isinstance(value, dict) | 
					
						
							|  |  |  |             ): | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |                 metadata[key] = str(value) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 05:55:00 +08:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |         if VECTOR_DB_CLIENT.has_collection(collection_name=collection_name): | 
					
						
							|  |  |  |             log.info(f"collection {collection_name} already exists") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-03 21:44:17 +08:00
										 |  |  |             if overwrite: | 
					
						
							|  |  |  |                 VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name) | 
					
						
							|  |  |  |                 log.info(f"deleting existing collection {collection_name}") | 
					
						
							| 
									
										
										
										
											2024-10-18 04:08:10 +08:00
										 |  |  |             elif add is False: | 
					
						
							| 
									
										
										
										
											2024-10-21 08:45:37 +08:00
										 |  |  |                 log.info( | 
					
						
							|  |  |  |                     f"collection {collection_name} already exists, overwrite is False and add is False" | 
					
						
							|  |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2024-10-03 21:44:17 +08:00
										 |  |  |                 return True | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-03 21:44:17 +08:00
										 |  |  |         log.info(f"adding to collection {collection_name}") | 
					
						
							|  |  |  |         embedding_function = get_embedding_function( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             request.app.state.config.RAG_EMBEDDING_ENGINE, | 
					
						
							|  |  |  |             request.app.state.config.RAG_EMBEDDING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |             request.app.state.ef, | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
										 |  |  |             ( | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 request.app.state.config.RAG_OPENAI_API_BASE_URL | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 if request.app.state.config.RAG_EMBEDDING_ENGINE == "openai" | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 else request.app.state.config.RAG_OLLAMA_BASE_URL | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
										 |  |  |             ), | 
					
						
							|  |  |  |             ( | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 request.app.state.config.RAG_OPENAI_API_KEY | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 if request.app.state.config.RAG_EMBEDDING_ENGINE == "openai" | 
					
						
							| 
									
										
										
										
											2024-12-13 12:22:17 +08:00
										 |  |  |                 else request.app.state.config.RAG_OLLAMA_API_KEY | 
					
						
							| 
									
										
										
										
											2024-11-19 06:19:56 +08:00
										 |  |  |             ), | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             request.app.state.config.RAG_EMBEDDING_BATCH_SIZE, | 
					
						
							| 
									
										
										
										
											2024-10-03 21:44:17 +08:00
										 |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         embeddings = embedding_function( | 
					
						
							| 
									
										
										
										
											2025-02-05 16:07:45 +08:00
										 |  |  |             list(map(lambda x: x.replace("\n", " "), texts)), user=user | 
					
						
							| 
									
										
										
										
											2024-10-03 21:44:17 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |         items = [ | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 "id": str(uuid.uuid4()), | 
					
						
							|  |  |  |                 "text": text, | 
					
						
							|  |  |  |                 "vector": embeddings[idx], | 
					
						
							| 
									
										
										
										
											2024-10-13 18:25:11 +08:00
										 |  |  |                 "metadata": metadatas[idx], | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |             } | 
					
						
							|  |  |  |             for idx, text in enumerate(texts) | 
					
						
							|  |  |  |         ] | 
					
						
							| 
									
										
										
										
											2024-10-04 15:46:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-03 21:44:17 +08:00
										 |  |  |         VECTOR_DB_CLIENT.insert( | 
					
						
							|  |  |  |             collection_name=collection_name, | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |             items=items, | 
					
						
							| 
									
										
										
										
											2024-10-03 21:44:17 +08:00
										 |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return True | 
					
						
							| 
									
										
										
										
											2024-04-15 05:55:00 +08:00
										 |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							| 
									
										
										
										
											2024-12-01 14:29:53 +08:00
										 |  |  |         raise e | 
					
						
							| 
									
										
										
										
											2024-02-02 05:35:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
# Request body accepted by the POST /process/file endpoint (process_file).
class ProcessFileForm(BaseModel):
    # ID of the stored file; process_file resolves it with Files.get_file_by_id.
    file_id: str
    # Optional replacement text. When non-empty, process_file indexes this text
    # instead of reading the file from storage or reusing earlier chunks.
    content: Optional[str] = None
    # Target vector collection. When None, process_file falls back to
    # "file-<file id>"; when set, documents are added to that collection.
    collection_name: Optional[str] = None
					
						
							| 
									
										
										
										
											2024-04-30 03:55:17 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/process/file") | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | def process_file( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     request: Request, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |     form_data: ProcessFileForm, | 
					
						
							|  |  |  |     user=Depends(get_verified_user), | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         file = Files.get_file_by_id(form_data.file_id) | 
					
						
							| 
									
										
										
										
											2024-06-12 16:37:53 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  |         collection_name = form_data.collection_name | 
					
						
							| 
									
										
										
										
											2024-10-06 00:58:46 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  |         if collection_name is None: | 
					
						
							| 
									
										
										
										
											2024-10-04 13:22:22 +08:00
										 |  |  |             collection_name = f"file-{file.id}" | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |         if form_data.content: | 
					
						
							| 
									
										
										
										
											2024-10-06 01:08:48 +08:00
										 |  |  |             # Update the content in the file | 
					
						
							|  |  |  |             # Usage: /files/{file_id}/data/content/update | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-27 05:09:52 +08:00
										 |  |  |             try: | 
					
						
							|  |  |  |                 # /files/{file_id}/data/content/update | 
					
						
							|  |  |  |                 VECTOR_DB_CLIENT.delete_collection(collection_name=f"file-{file.id}") | 
					
						
							|  |  |  |             except: | 
					
						
							|  |  |  |                 # Audio file upload pipeline | 
					
						
							|  |  |  |                 pass | 
					
						
							| 
									
										
										
										
											2024-10-06 01:05:12 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |             docs = [ | 
					
						
							|  |  |  |                 Document( | 
					
						
							| 
									
										
										
										
											2024-12-01 14:29:53 +08:00
										 |  |  |                     page_content=form_data.content.replace("<br/>", "\n"), | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |                     metadata={ | 
					
						
							| 
									
										
										
										
											2024-11-22 11:46:09 +08:00
										 |  |  |                         **file.meta, | 
					
						
							|  |  |  |                         "name": file.filename, | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |                         "created_by": file.user_id, | 
					
						
							| 
									
										
										
										
											2024-10-06 01:05:12 +08:00
										 |  |  |                         "file_id": file.id, | 
					
						
							| 
									
										
										
										
											2024-11-22 11:46:09 +08:00
										 |  |  |                         "source": file.filename, | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |                     }, | 
					
						
							|  |  |  |                 ) | 
					
						
							|  |  |  |             ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             text_content = form_data.content | 
					
						
							| 
									
										
										
										
											2024-10-06 00:58:46 +08:00
										 |  |  |         elif form_data.collection_name: | 
					
						
							| 
									
										
										
										
											2024-10-06 01:08:48 +08:00
										 |  |  |             # Check if the file has already been processed and save the content | 
					
						
							|  |  |  |             # Usage: /knowledge/{id}/file/add, /knowledge/{id}/file/update | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-06 00:58:46 +08:00
										 |  |  |             result = VECTOR_DB_CLIENT.query( | 
					
						
							|  |  |  |                 collection_name=f"file-{file.id}", filter={"file_id": file.id} | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-08 05:03:42 +08:00
										 |  |  |             if result is not None and len(result.ids[0]) > 0: | 
					
						
							| 
									
										
										
										
											2024-10-06 00:58:46 +08:00
										 |  |  |                 docs = [ | 
					
						
							|  |  |  |                     Document( | 
					
						
							|  |  |  |                         page_content=result.documents[0][idx], | 
					
						
							|  |  |  |                         metadata=result.metadatas[0][idx], | 
					
						
							|  |  |  |                     ) | 
					
						
							|  |  |  |                     for idx, id in enumerate(result.ids[0]) | 
					
						
							|  |  |  |                 ] | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 docs = [ | 
					
						
							|  |  |  |                     Document( | 
					
						
							|  |  |  |                         page_content=file.data.get("content", ""), | 
					
						
							|  |  |  |                         metadata={ | 
					
						
							| 
									
										
										
										
											2024-11-22 11:46:09 +08:00
										 |  |  |                             **file.meta, | 
					
						
							|  |  |  |                             "name": file.filename, | 
					
						
							| 
									
										
										
										
											2024-10-06 00:58:46 +08:00
										 |  |  |                             "created_by": file.user_id, | 
					
						
							| 
									
										
										
										
											2024-10-06 01:05:12 +08:00
										 |  |  |                             "file_id": file.id, | 
					
						
							| 
									
										
										
										
											2024-11-22 11:46:09 +08:00
										 |  |  |                             "source": file.filename, | 
					
						
							| 
									
										
										
										
											2024-10-06 00:58:46 +08:00
										 |  |  |                         }, | 
					
						
							|  |  |  |                     ) | 
					
						
							|  |  |  |                 ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |             text_content = file.data.get("content", "") | 
					
						
							|  |  |  |         else: | 
					
						
							| 
									
										
										
										
											2024-10-06 01:08:48 +08:00
										 |  |  |             # Process the file and save the content | 
					
						
							|  |  |  |             # Usage: /files/ | 
					
						
							| 
									
										
										
										
											2024-10-21 08:45:37 +08:00
										 |  |  |             file_path = file.path | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |             if file_path: | 
					
						
							| 
									
										
										
										
											2024-10-21 14:45:15 +08:00
										 |  |  |                 file_path = Storage.get_file(file_path) | 
					
						
							| 
									
										
										
										
											2024-10-06 00:58:46 +08:00
										 |  |  |                 loader = Loader( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                     engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE, | 
					
						
							|  |  |  |                     TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL, | 
					
						
							| 
									
										
										
										
											2025-02-14 20:08:03 +08:00
										 |  |  |                     DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                     PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES, | 
					
						
							| 
									
										
										
										
											2025-02-07 20:44:47 +08:00
										 |  |  |                     DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, | 
					
						
							|  |  |  |                     DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, | 
					
						
							| 
									
										
										
										
											2024-10-06 00:58:46 +08:00
										 |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |                 docs = loader.load( | 
					
						
							|  |  |  |                     file.filename, file.meta.get("content_type"), file_path | 
					
						
							|  |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2024-11-22 11:46:09 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |                 docs = [ | 
					
						
							|  |  |  |                     Document( | 
					
						
							|  |  |  |                         page_content=doc.page_content, | 
					
						
							|  |  |  |                         metadata={ | 
					
						
							|  |  |  |                             **doc.metadata, | 
					
						
							|  |  |  |                             "name": file.filename, | 
					
						
							|  |  |  |                             "created_by": file.user_id, | 
					
						
							|  |  |  |                             "file_id": file.id, | 
					
						
							|  |  |  |                             "source": file.filename, | 
					
						
							|  |  |  |                         }, | 
					
						
							|  |  |  |                     ) | 
					
						
							|  |  |  |                     for doc in docs | 
					
						
							|  |  |  |                 ] | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |             else: | 
					
						
							|  |  |  |                 docs = [ | 
					
						
							|  |  |  |                     Document( | 
					
						
							|  |  |  |                         page_content=file.data.get("content", ""), | 
					
						
							|  |  |  |                         metadata={ | 
					
						
							| 
									
										
										
										
											2024-11-22 11:46:09 +08:00
										 |  |  |                             **file.meta, | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |                             "name": file.filename, | 
					
						
							|  |  |  |                             "created_by": file.user_id, | 
					
						
							| 
									
										
										
										
											2024-10-06 01:05:12 +08:00
										 |  |  |                             "file_id": file.id, | 
					
						
							| 
									
										
										
										
											2024-11-22 11:46:09 +08:00
										 |  |  |                             "source": file.filename, | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |                         }, | 
					
						
							|  |  |  |                     ) | 
					
						
							|  |  |  |                 ] | 
					
						
							|  |  |  |             text_content = " ".join([doc.page_content for doc in docs]) | 
					
						
							| 
									
										
										
										
											2024-10-03 21:44:17 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 16:53:25 +08:00
										 |  |  |         log.debug(f"text_content: {text_content}") | 
					
						
							| 
									
										
										
										
											2024-10-04 13:22:22 +08:00
										 |  |  |         Files.update_file_data_by_id( | 
					
						
							| 
									
										
										
										
											2024-10-03 21:44:17 +08:00
										 |  |  |             file.id, | 
					
						
							| 
									
										
										
										
											2024-10-02 04:13:39 +08:00
										 |  |  |             {"content": text_content}, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:56:56 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-04 15:23:14 +08:00
										 |  |  |         hash = calculate_sha256_string(text_content) | 
					
						
							| 
									
										
										
										
											2024-10-04 13:22:22 +08:00
										 |  |  |         Files.update_file_hash_by_id(file.id, hash) | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |         if not request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL: | 
					
						
							|  |  |  |             try: | 
					
						
							|  |  |  |                 result = save_docs_to_vector_db( | 
					
						
							|  |  |  |                     request, | 
					
						
							|  |  |  |                     docs=docs, | 
					
						
							|  |  |  |                     collection_name=collection_name, | 
					
						
							|  |  |  |                     metadata={ | 
					
						
							|  |  |  |                         "file_id": file.id, | 
					
						
							|  |  |  |                         "name": file.filename, | 
					
						
							|  |  |  |                         "hash": hash, | 
					
						
							| 
									
										
										
										
											2024-10-04 13:22:22 +08:00
										 |  |  |                     }, | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |                     add=(True if form_data.collection_name else False), | 
					
						
							|  |  |  |                     user=user, | 
					
						
							| 
									
										
										
										
											2024-10-04 13:22:22 +08:00
										 |  |  |                 ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |                 if result: | 
					
						
							|  |  |  |                     Files.update_file_metadata_by_id( | 
					
						
							|  |  |  |                         file.id, | 
					
						
							|  |  |  |                         { | 
					
						
							|  |  |  |                             "collection_name": collection_name, | 
					
						
							|  |  |  |                         }, | 
					
						
							|  |  |  |                     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                     return { | 
					
						
							|  |  |  |                         "status": True, | 
					
						
							|  |  |  |                         "collection_name": collection_name, | 
					
						
							|  |  |  |                         "filename": file.filename, | 
					
						
							|  |  |  |                         "content": text_content, | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  |             except Exception as e: | 
					
						
							|  |  |  |                 raise e | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             return { | 
					
						
							|  |  |  |                 "status": True, | 
					
						
							|  |  |  |                 "collection_name": None, | 
					
						
							|  |  |  |                 "filename": file.filename, | 
					
						
							|  |  |  |                 "content": text_content, | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							|  |  |  |         if "No pandoc was found" in str(e): | 
					
						
							|  |  |  |             raise HTTPException( | 
					
						
							|  |  |  |                 status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |                 detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise HTTPException( | 
					
						
							|  |  |  |                 status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							| 
									
										
										
										
											2024-10-04 12:10:33 +08:00
										 |  |  |                 detail=str(e), | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:29:08 +08:00
										 |  |  | class ProcessTextForm(BaseModel): | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |     name: str | 
					
						
							|  |  |  |     content: str | 
					
						
							|  |  |  |     collection_name: Optional[str] = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/process/text") | 
					
						
							| 
									
										
										
										
											2024-09-28 08:29:08 +08:00
										 |  |  | def process_text( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     request: Request, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:29:08 +08:00
										 |  |  |     form_data: ProcessTextForm, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |     user=Depends(get_verified_user), | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     collection_name = form_data.collection_name | 
					
						
							|  |  |  |     if collection_name is None: | 
					
						
							|  |  |  |         collection_name = calculate_sha256_string(form_data.content) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  |     docs = [ | 
					
						
							|  |  |  |         Document( | 
					
						
							|  |  |  |             page_content=form_data.content, | 
					
						
							|  |  |  |             metadata={"name": form_data.name, "created_by": user.id}, | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |     ] | 
					
						
							| 
									
										
										
										
											2024-09-30 00:55:26 +08:00
										 |  |  |     text_content = form_data.content | 
					
						
							|  |  |  |     log.debug(f"text_content: {text_content}") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-01-29 18:55:52 +08:00
										 |  |  |     result = save_docs_to_vector_db(request, docs, collection_name, user=user) | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |     if result: | 
					
						
							| 
									
										
										
										
											2024-09-30 00:55:26 +08:00
										 |  |  |         return { | 
					
						
							|  |  |  |             "status": True, | 
					
						
							|  |  |  |             "collection_name": collection_name, | 
					
						
							|  |  |  |             "content": text_content, | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |     else: | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/process/youtube") | 
					
						
							|  |  |  | def process_youtube_video( | 
					
						
							|  |  |  |     request: Request, form_data: ProcessUrlForm, user=Depends(get_verified_user) | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  |         collection_name = form_data.collection_name | 
					
						
							|  |  |  |         if not collection_name: | 
					
						
							|  |  |  |             collection_name = calculate_sha256_string(form_data.url)[:63] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-21 02:02:07 +08:00
										 |  |  |         loader = YoutubeLoader( | 
					
						
							| 
									
										
										
										
											2024-11-27 22:09:33 +08:00
										 |  |  |             form_data.url, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             language=request.app.state.config.YOUTUBE_LOADER_LANGUAGE, | 
					
						
							|  |  |  |             proxy_url=request.app.state.config.YOUTUBE_LOADER_PROXY_URL, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-11-20 22:53:11 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  |         docs = loader.load() | 
					
						
							| 
									
										
										
										
											2024-10-07 10:44:02 +08:00
										 |  |  |         content = " ".join([doc.page_content for doc in docs]) | 
					
						
							|  |  |  |         log.debug(f"text_content: {content}") | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-05 16:07:45 +08:00
										 |  |  |         save_docs_to_vector_db( | 
					
						
							|  |  |  |             request, docs, collection_name, overwrite=True, user=user | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-09-28 08:29:08 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |         return { | 
					
						
							|  |  |  |             "status": True, | 
					
						
							|  |  |  |             "collection_name": collection_name, | 
					
						
							|  |  |  |             "filename": form_data.url, | 
					
						
							| 
									
										
										
										
											2024-10-07 10:44:02 +08:00
										 |  |  |             "file": { | 
					
						
							|  |  |  |                 "data": { | 
					
						
							|  |  |  |                     "content": content, | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 "meta": { | 
					
						
							|  |  |  |                     "name": form_data.url, | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |             }, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/process/web") | 
					
						
							|  |  |  | def process_web( | 
					
						
							|  |  |  |     request: Request, form_data: ProcessUrlForm, user=Depends(get_verified_user) | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  |         collection_name = form_data.collection_name | 
					
						
							|  |  |  |         if not collection_name: | 
					
						
							|  |  |  |             collection_name = calculate_sha256_string(form_data.url)[:63] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |         loader = get_web_loader( | 
					
						
							|  |  |  |             form_data.url, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             verify_ssl=request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, | 
					
						
							|  |  |  |             requests_per_second=request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  |         docs = loader.load() | 
					
						
							| 
									
										
										
										
											2024-10-07 10:44:02 +08:00
										 |  |  |         content = " ".join([doc.page_content for doc in docs]) | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-07 10:44:02 +08:00
										 |  |  |         log.debug(f"text_content: {content}") | 
					
						
							| 
									
										
										
										
											2025-02-28 08:34:05 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         if not request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: | 
					
						
							|  |  |  |             save_docs_to_vector_db( | 
					
						
							|  |  |  |                 request, docs, collection_name, overwrite=True, user=user | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             collection_name = None | 
					
						
							| 
									
										
										
										
											2024-09-28 08:29:08 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |         return { | 
					
						
							|  |  |  |             "status": True, | 
					
						
							|  |  |  |             "collection_name": collection_name, | 
					
						
							|  |  |  |             "filename": form_data.url, | 
					
						
							| 
									
										
										
										
											2024-10-07 10:44:02 +08:00
										 |  |  |             "file": { | 
					
						
							|  |  |  |                 "data": { | 
					
						
							|  |  |  |                     "content": content, | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 "meta": { | 
					
						
							|  |  |  |                     "name": form_data.url, | 
					
						
							| 
									
										
										
										
											2025-02-28 08:34:05 +08:00
										 |  |  |                     "source": form_data.url, | 
					
						
							| 
									
										
										
										
											2024-10-07 10:44:02 +08:00
										 |  |  |                 }, | 
					
						
							|  |  |  |             }, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-06-13 02:08:05 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | def search_web(request: Request, engine: str, query: str) -> list[SearchResult]: | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |     """Search the web using a search engine and return the results as a list of SearchResult objects.
 | 
					
						
							|  |  |  |     Will look for a search engine API key in environment variables in the following order: | 
					
						
							|  |  |  |     - SEARXNG_QUERY_URL | 
					
						
							|  |  |  |     - GOOGLE_PSE_API_KEY + GOOGLE_PSE_ENGINE_ID | 
					
						
							|  |  |  |     - BRAVE_SEARCH_API_KEY | 
					
						
							| 
									
										
										
										
											2024-12-08 13:21:10 +08:00
										 |  |  |     - KAGI_SEARCH_API_KEY | 
					
						
							| 
									
										
										
										
											2024-10-29 22:45:38 +08:00
										 |  |  |     - MOJEEK_SEARCH_API_KEY | 
					
						
							| 
									
										
										
										
											2025-02-10 16:44:47 +08:00
										 |  |  |     - BOCHA_SEARCH_API_KEY | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |     - SERPSTACK_API_KEY | 
					
						
							|  |  |  |     - SERPER_API_KEY | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |     - SERPLY_API_KEY | 
					
						
							| 
									
										
										
										
											2024-06-14 23:14:11 +08:00
										 |  |  |     - TAVILY_API_KEY | 
					
						
							| 
									
										
										
										
											2025-02-05 02:13:05 +08:00
										 |  |  |     - EXA_API_KEY | 
					
						
							| 
									
										
										
										
											2025-02-27 16:12:41 +08:00
										 |  |  |     - PERPLEXITY_API_KEY | 
					
						
							| 
									
										
										
										
											2024-08-27 15:45:17 +08:00
										 |  |  |     - SEARCHAPI_API_KEY + SEARCHAPI_ENGINE (by default `google`) | 
					
						
							| 
									
										
										
										
											2025-02-14 12:24:58 +08:00
										 |  |  |     - SERPAPI_API_KEY + SERPAPI_ENGINE (by default `google`) | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |     Args: | 
					
						
							|  |  |  |         query (str): The query to search for | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # TODO: add playwright to search the web | 
					
						
							|  |  |  |     if engine == "searxng": | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.SEARXNG_QUERY_URL: | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |             return search_searxng( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.SEARXNG_QUERY_URL, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |                 query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No SEARXNG_QUERY_URL found in environment variables") | 
					
						
							|  |  |  |     elif engine == "google_pse": | 
					
						
							|  |  |  |         if ( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             request.app.state.config.GOOGLE_PSE_API_KEY | 
					
						
							|  |  |  |             and request.app.state.config.GOOGLE_PSE_ENGINE_ID | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |         ): | 
					
						
							|  |  |  |             return search_google_pse( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.GOOGLE_PSE_API_KEY, | 
					
						
							|  |  |  |                 request.app.state.config.GOOGLE_PSE_ENGINE_ID, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |                 query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise Exception( | 
					
						
							|  |  |  |                 "No GOOGLE_PSE_API_KEY or GOOGLE_PSE_ENGINE_ID found in environment variables" | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |     elif engine == "brave": | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.BRAVE_SEARCH_API_KEY: | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |             return search_brave( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.BRAVE_SEARCH_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |                 query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No BRAVE_SEARCH_API_KEY found in environment variables") | 
					
						
							| 
									
										
										
										
											2024-12-08 13:21:10 +08:00
										 |  |  |     elif engine == "kagi": | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.KAGI_SEARCH_API_KEY: | 
					
						
							| 
									
										
										
										
											2024-12-08 13:21:10 +08:00
										 |  |  |             return search_kagi( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.KAGI_SEARCH_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-12-08 13:21:10 +08:00
										 |  |  |                 query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-12-08 13:21:10 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No KAGI_SEARCH_API_KEY found in environment variables") | 
					
						
							| 
									
										
										
										
											2024-10-29 22:45:38 +08:00
										 |  |  |     elif engine == "mojeek": | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.MOJEEK_SEARCH_API_KEY: | 
					
						
							| 
									
										
										
										
											2024-10-29 22:45:38 +08:00
										 |  |  |             return search_mojeek( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.MOJEEK_SEARCH_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-10-29 22:45:38 +08:00
										 |  |  |                 query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-10-29 22:45:38 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No MOJEEK_SEARCH_API_KEY found in environment variables") | 
					
						
							| 
									
										
										
										
											2025-02-10 16:44:47 +08:00
										 |  |  |     elif engine == "bocha": | 
					
						
							|  |  |  |         if request.app.state.config.BOCHA_SEARCH_API_KEY: | 
					
						
							|  |  |  |             return search_bocha( | 
					
						
							|  |  |  |                 request.app.state.config.BOCHA_SEARCH_API_KEY, | 
					
						
							|  |  |  |                 query, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No BOCHA_SEARCH_API_KEY found in environment variables") | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |     elif engine == "serpstack": | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.SERPSTACK_API_KEY: | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |             return search_serpstack( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.SERPSTACK_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |                 query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							|  |  |  |                 https_enabled=request.app.state.config.SERPSTACK_HTTPS, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No SERPSTACK_API_KEY found in environment variables") | 
					
						
							|  |  |  |     elif engine == "serper": | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.SERPER_API_KEY: | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |             return search_serper( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.SERPER_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |                 query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No SERPER_API_KEY found in environment variables") | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |     elif engine == "serply": | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.SERPLY_API_KEY: | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |             return search_serply( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.SERPLY_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |                 query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No SERPLY_API_KEY found in environment variables") | 
					
						
							| 
									
										
										
										
											2024-06-11 22:19:08 +08:00
										 |  |  |     elif engine == "duckduckgo": | 
					
						
							| 
									
										
										
										
											2024-06-18 05:32:23 +08:00
										 |  |  |         return search_duckduckgo( | 
					
						
							|  |  |  |             query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |             request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-06-18 05:32:23 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-06-14 23:14:11 +08:00
										 |  |  |     elif engine == "tavily": | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.TAVILY_API_KEY: | 
					
						
							| 
									
										
										
										
											2024-06-14 23:14:11 +08:00
										 |  |  |             return search_tavily( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.TAVILY_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-14 23:14:11 +08:00
										 |  |  |                 query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							| 
									
										
										
										
											2025-02-18 11:20:49 +08:00
										 |  |  |                 request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-06-14 23:14:11 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No TAVILY_API_KEY found in environment variables") | 
					
						
							| 
									
										
										
										
											2024-08-27 15:45:17 +08:00
										 |  |  |     elif engine == "searchapi": | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.SEARCHAPI_API_KEY: | 
					
						
							| 
									
										
										
										
											2024-08-27 15:45:17 +08:00
										 |  |  |             return search_searchapi( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.SEARCHAPI_API_KEY, | 
					
						
							|  |  |  |                 request.app.state.config.SEARCHAPI_ENGINE, | 
					
						
							| 
									
										
										
										
											2024-08-27 15:45:17 +08:00
										 |  |  |                 query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-08-27 15:45:17 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No SEARCHAPI_API_KEY found in environment variables") | 
					
						
							| 
									
										
										
										
											2025-02-14 12:24:58 +08:00
										 |  |  |     elif engine == "serpapi": | 
					
						
							|  |  |  |         if request.app.state.config.SERPAPI_API_KEY: | 
					
						
							|  |  |  |             return search_serpapi( | 
					
						
							|  |  |  |                 request.app.state.config.SERPAPI_API_KEY, | 
					
						
							|  |  |  |                 request.app.state.config.SERPAPI_ENGINE, | 
					
						
							|  |  |  |                 query, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No SERPAPI_API_KEY found in environment variables") | 
					
						
							| 
									
										
										
										
											2024-06-22 22:36:15 +08:00
										 |  |  |     elif engine == "jina": | 
					
						
							| 
									
										
										
										
											2024-11-04 09:07:24 +08:00
										 |  |  |         return search_jina( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             request.app.state.config.JINA_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-11-04 09:07:24 +08:00
										 |  |  |             query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							| 
									
										
										
										
											2024-11-04 09:07:24 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-10-28 17:33:52 +08:00
										 |  |  |     elif engine == "bing": | 
					
						
							|  |  |  |         return search_bing( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY, | 
					
						
							|  |  |  |             request.app.state.config.BING_SEARCH_V7_ENDPOINT, | 
					
						
							| 
									
										
										
										
											2024-10-28 17:33:52 +08:00
										 |  |  |             str(DEFAULT_LOCALE), | 
					
						
							|  |  |  |             query, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |             request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							| 
									
										
										
										
											2024-10-28 17:33:52 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-02-05 02:13:05 +08:00
										 |  |  |     elif engine == "exa": | 
					
						
							|  |  |  |         return search_exa( | 
					
						
							|  |  |  |             request.app.state.config.EXA_API_KEY, | 
					
						
							|  |  |  |             query, | 
					
						
							|  |  |  |             request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |             request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-02-27 16:12:41 +08:00
										 |  |  |     elif engine == "perplexity": | 
					
						
							|  |  |  |         return search_perplexity( | 
					
						
							|  |  |  |             request.app.state.config.PERPLEXITY_API_KEY, | 
					
						
							|  |  |  |             query, | 
					
						
							|  |  |  |             request.app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |             request.app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST, | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |     else: | 
					
						
							|  |  |  |         raise Exception("No search engine API key found in environment variables") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/process/web/search") | 
					
						
							| 
									
										
										
										
											2025-01-29 13:03:15 +08:00
										 |  |  | async def process_web_search( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     request: Request, form_data: SearchForm, user=Depends(get_verified_user) | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-06-12 15:18:22 +08:00
										 |  |  |         logging.info( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             f"trying to web search with {request.app.state.config.RAG_WEB_SEARCH_ENGINE, form_data.query}" | 
					
						
							| 
									
										
										
										
											2024-06-12 15:18:22 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |         web_results = search_web( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             request, request.app.state.config.RAG_WEB_SEARCH_ENGINE, form_data.query | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |         ) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.WEB_SEARCH_ERROR(e), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-25 08:52:57 +08:00
										 |  |  |     log.debug(f"web_results: {web_results}") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
										 |  |  |         collection_name = form_data.collection_name | 
					
						
							| 
									
										
										
										
											2024-12-25 08:52:57 +08:00
										 |  |  |         if collection_name == "" or collection_name is None: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             collection_name = f"web-search-{calculate_sha256_string(form_data.query)}"[ | 
					
						
							|  |  |  |                 :63 | 
					
						
							|  |  |  |             ] | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  |         urls = [result.link for result in web_results] | 
					
						
							| 
									
										
										
										
											2024-11-17 15:46:12 +08:00
										 |  |  |         loader = get_web_loader( | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  |             urls, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |             verify_ssl=request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, | 
					
						
							|  |  |  |             requests_per_second=request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS, | 
					
						
							| 
									
										
										
										
											2025-02-14 15:15:09 +08:00
										 |  |  |             trust_env=request.app.state.config.RAG_WEB_SEARCH_TRUST_ENV, | 
					
						
							| 
									
										
										
										
											2024-11-17 15:46:12 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-02-14 15:05:10 +08:00
										 |  |  |         docs = await loader.aload() | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |         if request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: | 
					
						
							| 
									
										
										
										
											2025-02-18 10:14:26 +08:00
										 |  |  |             return { | 
					
						
							|  |  |  |                 "status": True, | 
					
						
							| 
									
										
										
										
											2025-02-27 07:42:19 +08:00
										 |  |  |                 "collection_name": None, | 
					
						
							|  |  |  |                 "filenames": urls, | 
					
						
							| 
									
										
										
										
											2025-02-18 10:14:26 +08:00
										 |  |  |                 "docs": [ | 
					
						
							|  |  |  |                     { | 
					
						
							|  |  |  |                         "content": doc.page_content, | 
					
						
							|  |  |  |                         "metadata": doc.metadata, | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  |                     for doc in docs | 
					
						
							|  |  |  |                 ], | 
					
						
							|  |  |  |                 "loaded_count": len(docs), | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             await run_in_threadpool( | 
					
						
							|  |  |  |                 save_docs_to_vector_db, | 
					
						
							|  |  |  |                 request, | 
					
						
							|  |  |  |                 docs, | 
					
						
							|  |  |  |                 collection_name, | 
					
						
							|  |  |  |                 overwrite=True, | 
					
						
							| 
									
										
										
										
											2025-02-19 13:14:58 +08:00
										 |  |  |                 user=user, | 
					
						
							| 
									
										
										
										
											2025-02-18 10:14:26 +08:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-09-28 08:38:59 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-02-18 10:14:26 +08:00
										 |  |  |             return { | 
					
						
							|  |  |  |                 "status": True, | 
					
						
							|  |  |  |                 "collection_name": collection_name, | 
					
						
							|  |  |  |                 "filenames": urls, | 
					
						
							|  |  |  |                 "loaded_count": len(docs), | 
					
						
							|  |  |  |             } | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
										 |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
class QueryDocForm(BaseModel):
    # Request body consumed by query_doc_handler (POST /query/doc).
    # Name of the single collection to query.
    collection_name: str
    # Query text; the handler embeds it with the app's EMBEDDING_FUNCTION.
    query: str
    # Number of results; when not provided the handler uses config TOP_K.
    k: Optional[int] = None
    # Relevance threshold passed to the hybrid search; when not provided the
    # handler uses config RELEVANCE_THRESHOLD.
    r: Optional[float] = None
    # Optional hybrid-search flag supplied by the client.
    hybrid: Optional[bool] = None
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/query/doc") | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | def query_doc_handler( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     request: Request, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |     form_data: QueryDocForm, | 
					
						
							|  |  |  |     user=Depends(get_verified_user), | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.ENABLE_RAG_HYBRID_SEARCH: | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |             return query_doc_with_hybrid_search( | 
					
						
							|  |  |  |                 collection_name=form_data.collection_name, | 
					
						
							|  |  |  |                 query=form_data.query, | 
					
						
							| 
									
										
										
										
											2025-02-05 16:07:45 +08:00
										 |  |  |                 embedding_function=lambda query: request.app.state.EMBEDDING_FUNCTION( | 
					
						
							|  |  |  |                     query, user=user | 
					
						
							|  |  |  |                 ), | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 k=form_data.k if form_data.k else request.app.state.config.TOP_K, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |                 reranking_function=request.app.state.rf, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |                 r=( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                     form_data.r | 
					
						
							|  |  |  |                     if form_data.r | 
					
						
							|  |  |  |                     else request.app.state.config.RELEVANCE_THRESHOLD | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |                 ), | 
					
						
							| 
									
										
										
										
											2025-02-05 16:07:45 +08:00
										 |  |  |                 user=user, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             return query_doc( | 
					
						
							|  |  |  |                 collection_name=form_data.collection_name, | 
					
						
							| 
									
										
										
										
											2025-02-05 16:07:45 +08:00
										 |  |  |                 query_embedding=request.app.state.EMBEDDING_FUNCTION( | 
					
						
							|  |  |  |                     form_data.query, user=user | 
					
						
							|  |  |  |                 ), | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 k=form_data.k if form_data.k else request.app.state.config.TOP_K, | 
					
						
							| 
									
										
										
										
											2025-02-05 16:07:45 +08:00
										 |  |  |                 user=user, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
class QueryCollectionsForm(BaseModel):
    # Request body consumed by query_collection_handler
    # (POST /query/collection).
    # Names of the collections to query.
    collection_names: list[str]
    # Query text; the handler passes it on as a one-element `queries` list.
    query: str
    # Number of results; when not provided the handler uses config TOP_K.
    k: Optional[int] = None
    # Relevance threshold passed to the hybrid search; when not provided the
    # handler uses config RELEVANCE_THRESHOLD.
    r: Optional[float] = None
    # Optional hybrid-search flag supplied by the client.
    hybrid: Optional[bool] = None
					
						
							| 
									
										
										
										
											2024-03-26 14:47:08 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/query/collection") | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | def query_collection_handler( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     request: Request, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |     form_data: QueryCollectionsForm, | 
					
						
							| 
									
										
										
										
											2024-06-28 02:29:59 +08:00
										 |  |  |     user=Depends(get_verified_user), | 
					
						
							| 
									
										
										
										
											2024-01-07 18:46:12 +08:00
										 |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |         if request.app.state.config.ENABLE_RAG_HYBRID_SEARCH: | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |             return query_collection_with_hybrid_search( | 
					
						
							|  |  |  |                 collection_names=form_data.collection_names, | 
					
						
							| 
									
										
										
										
											2024-12-02 05:36:36 +08:00
										 |  |  |                 queries=[form_data.query], | 
					
						
							| 
									
										
										
										
											2025-02-05 16:07:45 +08:00
										 |  |  |                 embedding_function=lambda query: request.app.state.EMBEDDING_FUNCTION( | 
					
						
							|  |  |  |                     query, user=user | 
					
						
							|  |  |  |                 ), | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 k=form_data.k if form_data.k else request.app.state.config.TOP_K, | 
					
						
							| 
									
										
										
										
											2024-12-12 10:46:29 +08:00
										 |  |  |                 reranking_function=request.app.state.rf, | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |                 r=( | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                     form_data.r | 
					
						
							|  |  |  |                     if form_data.r | 
					
						
							|  |  |  |                     else request.app.state.config.RELEVANCE_THRESHOLD | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |                 ), | 
					
						
							| 
									
										
										
										
											2024-01-13 21:46:56 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |             return query_collection( | 
					
						
							|  |  |  |                 collection_names=form_data.collection_names, | 
					
						
							| 
									
										
										
										
											2024-12-02 05:36:36 +08:00
										 |  |  |                 queries=[form_data.query], | 
					
						
							| 
									
										
										
										
											2025-02-05 16:07:45 +08:00
										 |  |  |                 embedding_function=lambda query: request.app.state.EMBEDDING_FUNCTION( | 
					
						
							|  |  |  |                     query, user=user | 
					
						
							|  |  |  |                 ), | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |                 k=form_data.k if form_data.k else request.app.state.config.TOP_K, | 
					
						
							| 
									
										
										
										
											2024-07-15 19:05:38 +08:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-06-19 04:50:18 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  |         raise HTTPException( | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  |             status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-28 08:23:09 +08:00
										 |  |  | #################################### | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Vector DB operations | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #################################### | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-03 21:44:17 +08:00
class DeleteForm(BaseModel):
    # Request body consumed by delete_entries_from_collection (POST /delete).
    # Collection the entries are removed from.
    collection_name: str
    # Id of the file record; the handler deletes entries whose metadata hash
    # equals that record's `hash`.
    file_id: str
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/delete") | 
					
						
							| 
									
										
										
										
											2024-10-03 21:44:17 +08:00
										 |  |  | def delete_entries_from_collection(form_data: DeleteForm, user=Depends(get_admin_user)): | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         if VECTOR_DB_CLIENT.has_collection(collection_name=form_data.collection_name): | 
					
						
							|  |  |  |             file = Files.get_file_by_id(form_data.file_id) | 
					
						
							|  |  |  |             hash = file.hash | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             VECTOR_DB_CLIENT.delete( | 
					
						
							|  |  |  |                 collection_name=form_data.collection_name, | 
					
						
							|  |  |  |                 metadata={"hash": hash}, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |             return {"status": True} | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             return {"status": False} | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							|  |  |  |         return {"status": False} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
@router.post("/reset/db")
def reset_vector_db(user=Depends(get_admin_user)):
    """Reset the vector DB client and delete all knowledge records.

    Guarded by the ``get_admin_user`` dependency; ``user`` is not used in the
    body. Nothing is returned explicitly and no errors are caught here.
    """
    # Reset the vector store first, then remove the knowledge records.
    VECTOR_DB_CLIENT.reset()
    Knowledges.delete_all_knowledge()
					
						
							| 
									
										
										
										
											2024-01-07 17:40:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  | @router.post("/reset/uploads") | 
					
						
							| 
									
										
										
										
											2024-06-04 12:45:36 +08:00
										 |  |  | def reset_upload_dir(user=Depends(get_admin_user)) -> bool: | 
					
						
							|  |  |  |     folder = f"{UPLOAD_DIR}" | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         # Check if the directory exists | 
					
						
							|  |  |  |         if os.path.exists(folder): | 
					
						
							|  |  |  |             # Iterate over all the files and directories in the specified directory | 
					
						
							|  |  |  |             for filename in os.listdir(folder): | 
					
						
							|  |  |  |                 file_path = os.path.join(folder, filename) | 
					
						
							|  |  |  |                 try: | 
					
						
							|  |  |  |                     if os.path.isfile(file_path) or os.path.islink(file_path): | 
					
						
							|  |  |  |                         os.unlink(file_path)  # Remove the file or link | 
					
						
							|  |  |  |                     elif os.path.isdir(file_path): | 
					
						
							|  |  |  |                         shutil.rmtree(file_path)  # Remove the directory | 
					
						
							|  |  |  |                 except Exception as e: | 
					
						
							| 
									
										
										
										
											2025-02-25 22:36:25 +08:00
										 |  |  |                     log.exception(f"Failed to delete {file_path}. Reason: {e}") | 
					
						
							| 
									
										
										
										
											2024-06-04 12:45:36 +08:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2025-02-25 22:36:25 +08:00
										 |  |  |             log.warning(f"The directory {folder} does not exist") | 
					
						
							| 
									
										
										
										
											2024-06-04 12:45:36 +08:00
										 |  |  |     except Exception as e: | 
					
						
							| 
									
										
										
										
											2025-02-25 22:36:25 +08:00
										 |  |  |         log.exception(f"Failed to process the directory {folder}. Reason: {e}") | 
					
						
							| 
									
										
										
										
											2024-02-09 08:05:01 +08:00
										 |  |  |     return True | 
					
						
							| 
									
										
										
										
											2024-05-19 21:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-12 15:18:22 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-19 21:51:32 +08:00
										 |  |  | if ENV == "dev": | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-12 10:05:42 +08:00
										 |  |  |     @router.get("/ef/{text}") | 
					
						
							|  |  |  |     async def get_embeddings(request: Request, text: Optional[str] = "Hello World!"): | 
					
						
							|  |  |  |         return {"result": request.app.state.EMBEDDING_FUNCTION(text)} | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
# Request body for the batch file-processing endpoint.
class BatchProcessFilesForm(BaseModel):
    # File records whose content should be embedded.
    files: List[FileModel]
    # Name of the vector-DB collection the documents are written to.
    collection_name: str
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
# Per-file outcome entry reported by the batch file-processing endpoint.
class BatchProcessFilesResult(BaseModel):
    # ID of the file this entry refers to.
    file_id: str
    # Processing state; required (no default). The batch endpoint sets it to
    # "prepared", "completed" or "failed".
    status: str
    # Error message when processing failed; None otherwise.
    error: Optional[str] = None
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
# Response body of the batch file-processing endpoint.
class BatchProcessFilesResponse(BaseModel):
    # One entry per file that was successfully prepared for indexing.
    results: List[BatchProcessFilesResult]
    # One entry per failure recorded while preparing or saving files.
    errors: List[BatchProcessFilesResult]
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  | @router.post("/process/files/batch") | 
					
						
							|  |  |  | def process_files_batch( | 
					
						
							| 
									
										
										
										
											2024-12-31 00:36:34 +08:00
										 |  |  |     request: Request, | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  |     form_data: BatchProcessFilesForm, | 
					
						
							|  |  |  |     user=Depends(get_verified_user), | 
					
						
							|  |  |  | ) -> BatchProcessFilesResponse: | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Process a batch of files and save them to the vector database. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     results: List[BatchProcessFilesResult] = [] | 
					
						
							|  |  |  |     errors: List[BatchProcessFilesResult] = [] | 
					
						
							|  |  |  |     collection_name = form_data.collection_name | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Prepare all documents first | 
					
						
							|  |  |  |     all_docs: List[Document] = [] | 
					
						
							|  |  |  |     for file in form_data.files: | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             text_content = file.data.get("content", "") | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  |             docs: List[Document] = [ | 
					
						
							|  |  |  |                 Document( | 
					
						
							|  |  |  |                     page_content=text_content.replace("<br/>", "\n"), | 
					
						
							|  |  |  |                     metadata={ | 
					
						
							|  |  |  |                         **file.meta, | 
					
						
							|  |  |  |                         "name": file.filename, | 
					
						
							|  |  |  |                         "created_by": file.user_id, | 
					
						
							|  |  |  |                         "file_id": file.id, | 
					
						
							|  |  |  |                         "source": file.filename, | 
					
						
							|  |  |  |                     }, | 
					
						
							|  |  |  |                 ) | 
					
						
							|  |  |  |             ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             hash = calculate_sha256_string(text_content) | 
					
						
							|  |  |  |             Files.update_file_hash_by_id(file.id, hash) | 
					
						
							|  |  |  |             Files.update_file_data_by_id(file.id, {"content": text_content}) | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  |             all_docs.extend(docs) | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  |             results.append(BatchProcessFilesResult(file_id=file.id, status="prepared")) | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         except Exception as e: | 
					
						
							|  |  |  |             log.error(f"process_files_batch: Error processing file {file.id}: {str(e)}") | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  |             errors.append( | 
					
						
							|  |  |  |                 BatchProcessFilesResult(file_id=file.id, status="failed", error=str(e)) | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Save all documents in one batch | 
					
						
							|  |  |  |     if all_docs: | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             save_docs_to_vector_db( | 
					
						
							| 
									
										
										
										
											2024-12-31 00:36:34 +08:00
										 |  |  |                 request=request, | 
					
						
							|  |  |  |                 docs=all_docs, | 
					
						
							|  |  |  |                 collection_name=collection_name, | 
					
						
							|  |  |  |                 add=True, | 
					
						
							| 
									
										
										
										
											2025-01-29 18:55:52 +08:00
										 |  |  |                 user=user, | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  |             # Update all files with collection name | 
					
						
							|  |  |  |             for result in results: | 
					
						
							|  |  |  |                 Files.update_file_metadata_by_id( | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  |                     result.file_id, {"collection_name": collection_name} | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  |                 ) | 
					
						
							|  |  |  |                 result.status = "completed" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         except Exception as e: | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  |             log.error( | 
					
						
							|  |  |  |                 f"process_files_batch: Error saving documents to vector DB: {str(e)}" | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-12-16 14:34:06 +08:00
										 |  |  |             for result in results: | 
					
						
							|  |  |  |                 result.status = "failed" | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  |                 errors.append( | 
					
						
							|  |  |  |                     BatchProcessFilesResult(file_id=result.file_id, error=str(e)) | 
					
						
							|  |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2024-12-13 22:29:43 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-12-18 10:40:50 +08:00
										 |  |  |     return BatchProcessFilesResponse(results=results, errors=errors) |