| 
									
										
										
										
											2024-01-07 15:40:51 +08:00
										 |  |  | from fastapi import ( | 
					
						
							|  |  |  |     FastAPI, | 
					
						
							|  |  |  |     Depends, | 
					
						
							|  |  |  |     HTTPException, | 
					
						
							|  |  |  |     status, | 
					
						
							|  |  |  |     UploadFile, | 
					
						
							|  |  |  |     File, | 
					
						
							|  |  |  |     Form, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-01-07 14:07:20 +08:00
										 |  |  | from fastapi.middleware.cors import CORSMiddleware | 
					
						
							| 
									
										
										
										
											2024-06-12 09:15:04 +08:00
										 |  |  | import requests | 
					
						
							| 
									
										
										
										
											2024-04-03 23:19:18 +08:00
										 |  |  | import os, shutil, logging, re | 
					
						
							| 
									
										
										
										
											2024-06-08 12:18:04 +08:00
										 |  |  | from datetime import datetime | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | from pathlib import Path | 
					
						
							| 
									
										
										
										
											2024-06-11 23:06:14 +08:00
										 |  |  | from typing import List, Union, Sequence, Iterator, Any | 
					
						
							| 
									
										
										
										
											2024-01-07 14:07:20 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-09 22:38:40 +08:00
										 |  |  | from chromadb.utils.batch_utils import create_batches | 
					
						
							| 
									
										
										
										
											2024-06-11 23:06:14 +08:00
										 |  |  | from langchain_core.documents import Document | 
					
						
							| 
									
										
										
										
											2024-01-07 14:07:20 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-08 01:05:52 +08:00
										 |  |  | from langchain_community.document_loaders import ( | 
					
						
							|  |  |  |     WebBaseLoader, | 
					
						
							|  |  |  |     TextLoader, | 
					
						
							|  |  |  |     PyPDFLoader, | 
					
						
							|  |  |  |     CSVLoader, | 
					
						
							| 
									
										
										
										
											2024-03-25 18:26:18 +08:00
										 |  |  |     BSHTMLLoader, | 
					
						
							| 
									
										
										
										
											2024-01-08 05:56:01 +08:00
										 |  |  |     Docx2txtLoader, | 
					
						
							| 
									
										
										
										
											2024-01-13 21:46:56 +08:00
										 |  |  |     UnstructuredEPubLoader, | 
					
						
							| 
									
										
										
										
											2024-01-10 07:24:53 +08:00
										 |  |  |     UnstructuredWordDocumentLoader, | 
					
						
							|  |  |  |     UnstructuredMarkdownLoader, | 
					
						
							| 
									
										
										
											
												feat: Add RAG support for various programming languages
Enables RAG for golang, python, java, sh, bat, powershell, cmd, js, css, c/c++/c#, sql, logs, ini, perl, r, dart, docker, env, php, haskell, lua, conf, plsql, ruby, db2, scalla, bash, swift, vue, html, xml, and other arbitrary text files.
											
										 
											2024-01-17 15:09:47 +08:00
										 |  |  |     UnstructuredXMLLoader, | 
					
						
							| 
									
										
										
										
											2024-01-20 01:48:04 +08:00
										 |  |  |     UnstructuredRSTLoader, | 
					
						
							| 
									
										
										
										
											2024-01-24 05:03:22 +08:00
										 |  |  |     UnstructuredExcelLoader, | 
					
						
							| 
									
										
										
										
											2024-05-20 22:22:43 +08:00
										 |  |  |     UnstructuredPowerPointLoader, | 
					
						
							| 
									
										
										
										
											2024-05-02 08:17:00 +08:00
										 |  |  |     YoutubeLoader, | 
					
						
							| 
									
										
										
										
											2024-06-08 12:18:04 +08:00
										 |  |  |     OutlookMessageLoader, | 
					
						
							| 
									
										
										
										
											2024-01-08 01:05:52 +08:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | from langchain.text_splitter import RecursiveCharacterTextSplitter | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-30 03:55:17 +08:00
										 |  |  | import validators | 
					
						
							|  |  |  | import urllib.parse | 
					
						
							|  |  |  | import socket | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | from pydantic import BaseModel | 
					
						
							|  |  |  | from typing import Optional | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | import mimetypes | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | import uuid | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  | import json | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-23 02:27:43 +08:00
										 |  |  | import sentence_transformers | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-26 16:17:57 +08:00
										 |  |  | from apps.webui.models.documents import ( | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  |     Documents, | 
					
						
							|  |  |  |     DocumentForm, | 
					
						
							|  |  |  |     DocumentResponse, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-02-18 16:17:43 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 05:55:00 +08:00
										 |  |  | from apps.rag.utils import ( | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  |     get_model_path, | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  |     get_embedding_function, | 
					
						
							|  |  |  |     query_doc, | 
					
						
							|  |  |  |     query_doc_with_hybrid_search, | 
					
						
							|  |  |  |     query_collection, | 
					
						
							|  |  |  |     query_collection_with_hybrid_search, | 
					
						
							| 
									
										
										
										
											2024-04-15 05:55:00 +08:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-03-09 11:26:39 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  | from apps.rag.search.brave import search_brave | 
					
						
							|  |  |  | from apps.rag.search.google_pse import search_google_pse | 
					
						
							|  |  |  | from apps.rag.search.main import SearchResult | 
					
						
							|  |  |  | from apps.rag.search.searxng import search_searxng | 
					
						
							|  |  |  | from apps.rag.search.serper import search_serper | 
					
						
							|  |  |  | from apps.rag.search.serpstack import search_serpstack | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  | from apps.rag.search.serply import search_serply | 
					
						
							| 
									
										
										
										
											2024-06-11 22:19:08 +08:00
										 |  |  | from apps.rag.search.duckduckgo import search_duckduckgo | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | from utils.misc import ( | 
					
						
							|  |  |  |     calculate_sha256, | 
					
						
							|  |  |  |     calculate_sha256_string, | 
					
						
							|  |  |  |     sanitize_filename, | 
					
						
							|  |  |  |     extract_folders_after_data_docs, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-02-09 08:05:01 +08:00
										 |  |  | from utils.utils import get_current_user, get_admin_user | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | from config import ( | 
					
						
							| 
									
										
										
										
											2024-06-02 22:34:31 +08:00
										 |  |  |     AppConfig, | 
					
						
							| 
									
										
										
										
											2024-05-19 21:51:32 +08:00
										 |  |  |     ENV, | 
					
						
							| 
									
										
										
										
											2024-03-21 07:11:36 +08:00
										 |  |  |     SRC_LOG_LEVELS, | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  |     UPLOAD_DIR, | 
					
						
							|  |  |  |     DOCS_DIR, | 
					
						
							| 
									
										
										
										
											2024-04-23 07:36:46 +08:00
										 |  |  |     RAG_TOP_K, | 
					
						
							|  |  |  |     RAG_RELEVANCE_THRESHOLD, | 
					
						
							| 
									
										
										
										
											2024-04-15 05:55:00 +08:00
										 |  |  |     RAG_EMBEDDING_ENGINE, | 
					
						
							| 
									
										
										
										
											2024-02-19 03:16:10 +08:00
										 |  |  |     RAG_EMBEDDING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  |     RAG_EMBEDDING_MODEL_AUTO_UPDATE, | 
					
						
							| 
									
										
										
										
											2024-04-23 02:27:43 +08:00
										 |  |  |     RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE, | 
					
						
							| 
									
										
										
										
											2024-04-27 02:41:39 +08:00
										 |  |  |     ENABLE_RAG_HYBRID_SEARCH, | 
					
						
							| 
									
										
										
										
											2024-05-07 04:12:08 +08:00
										 |  |  |     ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  |     RAG_RERANKING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-04-28 06:54:26 +08:00
										 |  |  |     PDF_EXTRACT_IMAGES, | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  |     RAG_RERANKING_MODEL_AUTO_UPDATE, | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  |     RAG_RERANKING_MODEL_TRUST_REMOTE_CODE, | 
					
						
							| 
									
										
										
										
											2024-04-21 04:15:59 +08:00
										 |  |  |     RAG_OPENAI_API_BASE_URL, | 
					
						
							|  |  |  |     RAG_OPENAI_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-03-20 15:44:09 +08:00
										 |  |  |     DEVICE_TYPE, | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  |     CHROMA_CLIENT, | 
					
						
							|  |  |  |     CHUNK_SIZE, | 
					
						
							|  |  |  |     CHUNK_OVERLAP, | 
					
						
							| 
									
										
										
										
											2024-02-18 14:41:03 +08:00
										 |  |  |     RAG_TEMPLATE, | 
					
						
							| 
									
										
										
										
											2024-05-07 04:12:08 +08:00
										 |  |  |     ENABLE_RAG_LOCAL_WEB_FETCH, | 
					
						
							| 
									
										
										
										
											2024-05-09 01:51:29 +08:00
										 |  |  |     YOUTUBE_LOADER_LANGUAGE, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |     ENABLE_RAG_WEB_SEARCH, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     RAG_WEB_SEARCH_ENGINE, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |     SEARXNG_QUERY_URL, | 
					
						
							|  |  |  |     GOOGLE_PSE_API_KEY, | 
					
						
							|  |  |  |     GOOGLE_PSE_ENGINE_ID, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     BRAVE_SEARCH_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |     SERPSTACK_API_KEY, | 
					
						
							|  |  |  |     SERPSTACK_HTTPS, | 
					
						
							|  |  |  |     SERPER_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |     SERPLY_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |     RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							| 
									
										
										
										
											2024-05-11 23:12:52 +08:00
										 |  |  |     RAG_WEB_SEARCH_CONCURRENT_REQUESTS, | 
					
						
							| 
									
										
										
										
											2024-06-02 22:34:31 +08:00
										 |  |  |     RAG_EMBEDDING_OPENAI_BATCH_SIZE, | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2024-02-18 16:20:54 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | from constants import ERROR_MESSAGES | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-21 07:11:36 +08:00
# Module-level logger for this app; its level is taken from the "RAG" entry
# of SRC_LOG_LEVELS (imported from config).
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-07 14:07:20 +08:00
# FastAPI application object for the RAG endpoints defined in this module.
app = FastAPI()

# Runtime configuration container. Every value below is seeded from the
# constant of (nearly) the same name imported from `config`, and is read back
# by the handlers through `app.state.config.<NAME>`.
app.state.config = AppConfig()

# Retrieval settings.
app.state.config.TOP_K = RAG_TOP_K
app.state.config.RELEVANCE_THRESHOLD = RAG_RELEVANCE_THRESHOLD

app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH
app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
    ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION
)

# Text-splitting settings.
app.state.config.CHUNK_SIZE = CHUNK_SIZE
app.state.config.CHUNK_OVERLAP = CHUNK_OVERLAP

# Embedding / reranking model selection and the RAG prompt template.
app.state.config.RAG_EMBEDDING_ENGINE = RAG_EMBEDDING_ENGINE
app.state.config.RAG_EMBEDDING_MODEL = RAG_EMBEDDING_MODEL
app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE = RAG_EMBEDDING_OPENAI_BATCH_SIZE
app.state.config.RAG_RERANKING_MODEL = RAG_RERANKING_MODEL
app.state.config.RAG_TEMPLATE = RAG_TEMPLATE


# OpenAI-compatible endpoint settings; note the config keys drop the
# `RAG_` prefix carried by the imported constants.
app.state.config.OPENAI_API_BASE_URL = RAG_OPENAI_API_BASE_URL
app.state.config.OPENAI_API_KEY = RAG_OPENAI_API_KEY

app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES


app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE
# NOTE(review): unlike its neighbours this is stored directly on `app.state`,
# not on `app.state.config`, and always starts as None.
app.state.YOUTUBE_LOADER_TRANSLATION = None


# Web-search feature switch and engine selection.
app.state.config.ENABLE_RAG_WEB_SEARCH = ENABLE_RAG_WEB_SEARCH
app.state.config.RAG_WEB_SEARCH_ENGINE = RAG_WEB_SEARCH_ENGINE

# Per-provider web-search settings and shared search limits.
app.state.config.SEARXNG_QUERY_URL = SEARXNG_QUERY_URL
app.state.config.GOOGLE_PSE_API_KEY = GOOGLE_PSE_API_KEY
app.state.config.GOOGLE_PSE_ENGINE_ID = GOOGLE_PSE_ENGINE_ID
app.state.config.BRAVE_SEARCH_API_KEY = BRAVE_SEARCH_API_KEY
app.state.config.SERPSTACK_API_KEY = SERPSTACK_API_KEY
app.state.config.SERPSTACK_HTTPS = SERPSTACK_HTTPS
app.state.config.SERPER_API_KEY = SERPER_API_KEY
app.state.config.SERPLY_API_KEY = SERPLY_API_KEY
app.state.config.RAG_WEB_SEARCH_RESULT_COUNT = RAG_WEB_SEARCH_RESULT_COUNT
app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = RAG_WEB_SEARCH_CONCURRENT_REQUESTS
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  | def update_embedding_model( | 
					
						
							|  |  |  |     embedding_model: str, | 
					
						
							|  |  |  |     update_model: bool = False, | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |     if embedding_model and app.state.config.RAG_EMBEDDING_ENGINE == "": | 
					
						
							| 
									
										
										
										
											2024-04-25 20:49:59 +08:00
										 |  |  |         app.state.sentence_transformer_ef = sentence_transformers.SentenceTransformer( | 
					
						
							|  |  |  |             get_model_path(embedding_model, update_model), | 
					
						
							|  |  |  |             device=DEVICE_TYPE, | 
					
						
							|  |  |  |             trust_remote_code=RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE, | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         app.state.sentence_transformer_ef = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def update_reranking_model( | 
					
						
							|  |  |  |     reranking_model: str, | 
					
						
							|  |  |  |     update_model: bool = False, | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     if reranking_model: | 
					
						
							|  |  |  |         app.state.sentence_transformer_rf = sentence_transformers.CrossEncoder( | 
					
						
							|  |  |  |             get_model_path(reranking_model, update_model), | 
					
						
							|  |  |  |             device=DEVICE_TYPE, | 
					
						
							|  |  |  |             trust_remote_code=RAG_RERANKING_MODEL_TRUST_REMOTE_CODE, | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         app.state.sentence_transformer_rf = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# Import-time model setup: populate app.state.sentence_transformer_ef and
# app.state.sentence_transformer_rf from the configured model names.
update_embedding_model(
    app.state.config.RAG_EMBEDDING_MODEL,
    RAG_EMBEDDING_MODEL_AUTO_UPDATE,
)

update_reranking_model(
    app.state.config.RAG_RERANKING_MODEL,
    RAG_RERANKING_MODEL_AUTO_UPDATE,
)


# Must run after update_embedding_model(): it reads
# app.state.sentence_transformer_ef, which that call assigns.
app.state.EMBEDDING_FUNCTION = get_embedding_function(
    app.state.config.RAG_EMBEDDING_ENGINE,
    app.state.config.RAG_EMBEDDING_MODEL,
    app.state.sentence_transformer_ef,
    app.state.config.OPENAI_API_KEY,
    app.state.config.OPENAI_API_BASE_URL,
    app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE,
)
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-07 14:07:20 +08:00
# CORS policy for this app: every origin, method and header is accepted.
# NOTE(review): wildcard origins together with allow_credentials=True is a
# very permissive combination -- confirm it is intended for production use.
origins = ["*"]


app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-07 15:40:51 +08:00
# Base request model carrying the name of the target collection.
class CollectionNameForm(BaseModel):
    # Optional; falls back to the literal "test" when the client omits it.
    collection_name: Optional[str] = "test"
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-02 08:17:00 +08:00
# CollectionNameForm extended with a required URL string.
class UrlForm(CollectionNameForm):
    url: str  # required; no validation is applied at the model level
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-26 14:47:08 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
# CollectionNameForm extended with a required search query string.
class SearchForm(CollectionNameForm):
    query: str  # required
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-07 14:07:20 +08:00
										 |  |  | @app.get("/") | 
					
						
							|  |  |  | async def get_status(): | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         "chunk_size": app.state.config.CHUNK_SIZE, | 
					
						
							|  |  |  |         "chunk_overlap": app.state.config.CHUNK_OVERLAP, | 
					
						
							|  |  |  |         "template": app.state.config.RAG_TEMPLATE, | 
					
						
							|  |  |  |         "embedding_engine": app.state.config.RAG_EMBEDDING_ENGINE, | 
					
						
							|  |  |  |         "embedding_model": app.state.config.RAG_EMBEDDING_MODEL, | 
					
						
							|  |  |  |         "reranking_model": app.state.config.RAG_RERANKING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-06-02 22:34:31 +08:00
										 |  |  |         "openai_batch_size": app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE, | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 06:31:40 +08:00
										 |  |  | @app.get("/embedding") | 
					
						
							|  |  |  | async def get_embedding_config(user=Depends(get_admin_user)): | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         "embedding_engine": app.state.config.RAG_EMBEDDING_ENGINE, | 
					
						
							|  |  |  |         "embedding_model": app.state.config.RAG_EMBEDDING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  |         "openai_config": { | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             "url": app.state.config.OPENAI_API_BASE_URL, | 
					
						
							|  |  |  |             "key": app.state.config.OPENAI_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-02 22:34:31 +08:00
										 |  |  |             "batch_size": app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE, | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  | @app.get("/reranking") | 
					
						
							|  |  |  | async def get_reraanking_config(user=Depends(get_admin_user)): | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         "reranking_model": app.state.config.RAG_RERANKING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
# OpenAI endpoint settings as submitted by a client. The field names mirror
# the "openai_config" object returned by GET /embedding.
class OpenAIConfigForm(BaseModel):
    url: str  # required
    key: str  # required
    batch_size: Optional[int] = None  # optional; None when omitted
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  | class EmbeddingModelUpdateForm(BaseModel): | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  |     openai_config: Optional[OpenAIConfigForm] = None | 
					
						
							| 
									
										
										
										
											2024-04-15 06:31:40 +08:00
										 |  |  |     embedding_engine: str | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  |     embedding_model: str | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 06:31:40 +08:00
										 |  |  | @app.post("/embedding/update") | 
					
						
							|  |  |  | async def update_embedding_config( | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  |     form_data: EmbeddingModelUpdateForm, user=Depends(get_admin_user) | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-04-05 02:07:42 +08:00
										 |  |  |     log.info( | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         f"Updating embedding model: {app.state.config.RAG_EMBEDDING_MODEL} to {form_data.embedding_model}" | 
					
						
							| 
									
										
										
										
											2024-02-20 03:05:45 +08:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-04-05 01:01:23 +08:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         app.state.config.RAG_EMBEDDING_ENGINE = form_data.embedding_engine | 
					
						
							|  |  |  |         app.state.config.RAG_EMBEDDING_MODEL = form_data.embedding_model | 
					
						
							| 
									
										
										
										
											2024-04-15 06:31:40 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         if app.state.config.RAG_EMBEDDING_ENGINE in ["ollama", "openai"]: | 
					
						
							| 
									
										
										
										
											2024-06-02 22:34:31 +08:00
										 |  |  |             if form_data.openai_config is not None: | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |                 app.state.config.OPENAI_API_BASE_URL = form_data.openai_config.url | 
					
						
							|  |  |  |                 app.state.config.OPENAI_API_KEY = form_data.openai_config.key | 
					
						
							| 
									
										
										
										
											2024-06-02 22:34:31 +08:00
										 |  |  |                 app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE = ( | 
					
						
							|  |  |  |                     form_data.openai_config.batch_size | 
					
						
							|  |  |  |                     if form_data.openai_config.batch_size | 
					
						
							|  |  |  |                     else 1 | 
					
						
							|  |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2024-04-23 07:36:46 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-19 21:51:32 +08:00
										 |  |  |         update_embedding_model(app.state.config.RAG_EMBEDDING_MODEL) | 
					
						
							| 
									
										
										
										
											2024-04-05 01:01:23 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  |         app.state.EMBEDDING_FUNCTION = get_embedding_function( | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             app.state.config.RAG_EMBEDDING_ENGINE, | 
					
						
							|  |  |  |             app.state.config.RAG_EMBEDDING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  |             app.state.sentence_transformer_ef, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             app.state.config.OPENAI_API_KEY, | 
					
						
							|  |  |  |             app.state.config.OPENAI_API_BASE_URL, | 
					
						
							| 
									
										
										
										
											2024-06-02 22:34:31 +08:00
										 |  |  |             app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE, | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-10 15:59:05 +08:00
										 |  |  |         return { | 
					
						
							|  |  |  |             "status": True, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             "embedding_engine": app.state.config.RAG_EMBEDDING_ENGINE, | 
					
						
							|  |  |  |             "embedding_model": app.state.config.RAG_EMBEDDING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  |             "openai_config": { | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |                 "url": app.state.config.OPENAI_API_BASE_URL, | 
					
						
							|  |  |  |                 "key": app.state.config.OPENAI_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-02 22:34:31 +08:00
										 |  |  |                 "batch_size": app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE, | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  |             }, | 
					
						
							| 
									
										
										
										
											2024-04-10 15:59:05 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(f"Problem updating embedding model: {e}") | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  | class RerankingModelUpdateForm(BaseModel): | 
					
						
							|  |  |  |     reranking_model: str | 
					
						
							| 
									
										
										
										
											2024-04-23 07:36:46 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | @app.post("/reranking/update") | 
					
						
							|  |  |  | async def update_reranking_config( | 
					
						
							|  |  |  |     form_data: RerankingModelUpdateForm, user=Depends(get_admin_user) | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     log.info( | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         f"Updating reranking model: {app.state.config.RAG_RERANKING_MODEL} to {form_data.reranking_model}" | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         app.state.config.RAG_RERANKING_MODEL = form_data.reranking_model | 
					
						
							| 
									
										
										
										
											2024-04-23 07:36:46 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         update_reranking_model(app.state.config.RAG_RERANKING_MODEL), True | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         return { | 
					
						
							|  |  |  |             "status": True, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             "reranking_model": app.state.config.RAG_RERANKING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  |         } | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(f"Problem updating reranking model: {e}") | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  | @app.get("/config") | 
					
						
							|  |  |  | async def get_rag_config(user=Depends(get_admin_user)): | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         "pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES, | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  |         "chunk": { | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             "chunk_size": app.state.config.CHUNK_SIZE, | 
					
						
							|  |  |  |             "chunk_overlap": app.state.config.CHUNK_OVERLAP, | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |         "youtube": { | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             "language": app.state.config.YOUTUBE_LOADER_LANGUAGE, | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |             "translation": app.state.YOUTUBE_LOADER_TRANSLATION, | 
					
						
							|  |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |         "web": { | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |             "ssl_verification": app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |             "search": { | 
					
						
							| 
									
										
										
										
											2024-06-02 11:08:08 +08:00
										 |  |  |                 "enabled": app.state.config.ENABLE_RAG_WEB_SEARCH, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |                 "engine": app.state.config.RAG_WEB_SEARCH_ENGINE, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |                 "searxng_query_url": app.state.config.SEARXNG_QUERY_URL, | 
					
						
							|  |  |  |                 "google_pse_api_key": app.state.config.GOOGLE_PSE_API_KEY, | 
					
						
							|  |  |  |                 "google_pse_engine_id": app.state.config.GOOGLE_PSE_ENGINE_ID, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |                 "brave_search_api_key": app.state.config.BRAVE_SEARCH_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |                 "serpstack_api_key": app.state.config.SERPSTACK_API_KEY, | 
					
						
							|  |  |  |                 "serpstack_https": app.state.config.SERPSTACK_HTTPS, | 
					
						
							|  |  |  |                 "serper_api_key": app.state.config.SERPER_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |                 "serply_api_key": app.state.config.SERPLY_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |                 "result_count": app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 "concurrent_requests": app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |             }, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class ChunkParamUpdateForm(BaseModel): | 
					
						
							|  |  |  |     chunk_size: int | 
					
						
							|  |  |  |     chunk_overlap: int | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  | class YoutubeLoaderConfig(BaseModel): | 
					
						
							|  |  |  |     language: List[str] | 
					
						
							|  |  |  |     translation: Optional[str] = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  | class WebSearchConfig(BaseModel): | 
					
						
							| 
									
										
										
										
											2024-06-02 11:08:08 +08:00
										 |  |  |     enabled: bool | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     engine: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |     searxng_query_url: Optional[str] = None | 
					
						
							|  |  |  |     google_pse_api_key: Optional[str] = None | 
					
						
							|  |  |  |     google_pse_engine_id: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     brave_search_api_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |     serpstack_api_key: Optional[str] = None | 
					
						
							|  |  |  |     serpstack_https: Optional[bool] = None | 
					
						
							|  |  |  |     serper_api_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |     serply_api_key: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-06-02 10:03:56 +08:00
										 |  |  |     result_count: Optional[int] = None | 
					
						
							|  |  |  |     concurrent_requests: Optional[int] = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  | class WebConfig(BaseModel): | 
					
						
							|  |  |  |     search: WebSearchConfig | 
					
						
							|  |  |  |     web_loader_ssl_verification: Optional[bool] = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  | class ConfigUpdateForm(BaseModel): | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  |     pdf_extract_images: Optional[bool] = None | 
					
						
							|  |  |  |     chunk: Optional[ChunkParamUpdateForm] = None | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |     youtube: Optional[YoutubeLoaderConfig] = None | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     web: Optional[WebConfig] = None | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @app.post("/config/update") | 
					
						
							|  |  |  | async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_user)): | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |     app.state.config.PDF_EXTRACT_IMAGES = ( | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  |         form_data.pdf_extract_images | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         if form_data.pdf_extract_images is not None | 
					
						
							|  |  |  |         else app.state.config.PDF_EXTRACT_IMAGES | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     if form_data.chunk is not None: | 
					
						
							|  |  |  |         app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size | 
					
						
							|  |  |  |         app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     if form_data.youtube is not None: | 
					
						
							|  |  |  |         app.state.config.YOUTUBE_LOADER_LANGUAGE = form_data.youtube.language | 
					
						
							|  |  |  |         app.state.YOUTUBE_LOADER_TRANSLATION = form_data.youtube.translation | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |     if form_data.web is not None: | 
					
						
							|  |  |  |         app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = ( | 
					
						
							|  |  |  |             form_data.web.web_loader_ssl_verification | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 11:08:08 +08:00
										 |  |  |         app.state.config.ENABLE_RAG_WEB_SEARCH = form_data.web.search.enabled | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |         app.state.config.RAG_WEB_SEARCH_ENGINE = form_data.web.search.engine | 
					
						
							|  |  |  |         app.state.config.SEARXNG_QUERY_URL = form_data.web.search.searxng_query_url | 
					
						
							|  |  |  |         app.state.config.GOOGLE_PSE_API_KEY = form_data.web.search.google_pse_api_key | 
					
						
							|  |  |  |         app.state.config.GOOGLE_PSE_ENGINE_ID = ( | 
					
						
							|  |  |  |             form_data.web.search.google_pse_engine_id | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         app.state.config.BRAVE_SEARCH_API_KEY = ( | 
					
						
							|  |  |  |             form_data.web.search.brave_search_api_key | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         app.state.config.SERPSTACK_API_KEY = form_data.web.search.serpstack_api_key | 
					
						
							|  |  |  |         app.state.config.SERPSTACK_HTTPS = form_data.web.search.serpstack_https | 
					
						
							|  |  |  |         app.state.config.SERPER_API_KEY = form_data.web.search.serper_api_key | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |         app.state.config.SERPLY_API_KEY = form_data.web.search.serply_api_key | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |         app.state.config.RAG_WEB_SEARCH_RESULT_COUNT = form_data.web.search.result_count | 
					
						
							|  |  |  |         app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = ( | 
					
						
							|  |  |  |             form_data.web.search.concurrent_requests | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         "pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES, | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  |         "chunk": { | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             "chunk_size": app.state.config.CHUNK_SIZE, | 
					
						
							|  |  |  |             "chunk_overlap": app.state.config.CHUNK_OVERLAP, | 
					
						
							| 
									
										
										
										
											2024-03-11 04:32:34 +08:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |         "youtube": { | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             "language": app.state.config.YOUTUBE_LOADER_LANGUAGE, | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |             "translation": app.state.YOUTUBE_LOADER_TRANSLATION, | 
					
						
							|  |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |         "web": { | 
					
						
							|  |  |  |             "ssl_verification": app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, | 
					
						
							|  |  |  |             "search": { | 
					
						
							| 
									
										
										
										
											2024-06-02 11:08:08 +08:00
										 |  |  |                 "enabled": app.state.config.ENABLE_RAG_WEB_SEARCH, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |                 "engine": app.state.config.RAG_WEB_SEARCH_ENGINE, | 
					
						
							|  |  |  |                 "searxng_query_url": app.state.config.SEARXNG_QUERY_URL, | 
					
						
							|  |  |  |                 "google_pse_api_key": app.state.config.GOOGLE_PSE_API_KEY, | 
					
						
							|  |  |  |                 "google_pse_engine_id": app.state.config.GOOGLE_PSE_ENGINE_ID, | 
					
						
							|  |  |  |                 "brave_search_api_key": app.state.config.BRAVE_SEARCH_API_KEY, | 
					
						
							|  |  |  |                 "serpstack_api_key": app.state.config.SERPSTACK_API_KEY, | 
					
						
							|  |  |  |                 "serpstack_https": app.state.config.SERPSTACK_HTTPS, | 
					
						
							|  |  |  |                 "serper_api_key": app.state.config.SERPER_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |                 "serply_api_key": app.state.config.SERPLY_API_KEY, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:40:48 +08:00
										 |  |  |                 "result_count": app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |                 "concurrent_requests": app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS, | 
					
						
							|  |  |  |             }, | 
					
						
							|  |  |  |         }, | 
					
						
							| 
									
										
										
										
											2024-02-18 14:29:52 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-18 14:41:03 +08:00
										 |  |  | @app.get("/template") | 
					
						
							|  |  |  | async def get_rag_template(user=Depends(get_current_user)): | 
					
						
							|  |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         "template": app.state.config.RAG_TEMPLATE, | 
					
						
							| 
									
										
										
										
											2024-02-18 14:41:03 +08:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  | @app.get("/query/settings") | 
					
						
							|  |  |  | async def get_query_settings(user=Depends(get_admin_user)): | 
					
						
							|  |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         "template": app.state.config.RAG_TEMPLATE, | 
					
						
							|  |  |  |         "k": app.state.config.TOP_K, | 
					
						
							|  |  |  |         "r": app.state.config.RELEVANCE_THRESHOLD, | 
					
						
							|  |  |  |         "hybrid": app.state.config.ENABLE_RAG_HYBRID_SEARCH, | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-02-18 14:41:03 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  | class QuerySettingsForm(BaseModel): | 
					
						
							|  |  |  |     k: Optional[int] = None | 
					
						
							| 
									
										
										
										
											2024-04-23 07:36:46 +08:00
										 |  |  |     r: Optional[float] = None | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  |     template: Optional[str] = None | 
					
						
							| 
									
										
										
										
											2024-04-26 06:31:21 +08:00
										 |  |  |     hybrid: Optional[bool] = None | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @app.post("/query/settings/update") | 
					
						
							|  |  |  | async def update_query_settings( | 
					
						
							|  |  |  |     form_data: QuerySettingsForm, user=Depends(get_admin_user) | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |     app.state.config.RAG_TEMPLATE = ( | 
					
						
							| 
									
										
										
										
											2024-05-18 10:53:38 +08:00
										 |  |  |         form_data.template if form_data.template else RAG_TEMPLATE | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |     app.state.config.TOP_K = form_data.k if form_data.k else 4 | 
					
						
							|  |  |  |     app.state.config.RELEVANCE_THRESHOLD = form_data.r if form_data.r else 0.0 | 
					
						
							|  |  |  |     app.state.config.ENABLE_RAG_HYBRID_SEARCH = ( | 
					
						
							| 
									
										
										
										
											2024-05-18 10:53:38 +08:00
										 |  |  |         form_data.hybrid if form_data.hybrid else False | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-04-26 06:31:21 +08:00
										 |  |  |     return { | 
					
						
							|  |  |  |         "status": True, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         "template": app.state.config.RAG_TEMPLATE, | 
					
						
							|  |  |  |         "k": app.state.config.TOP_K, | 
					
						
							|  |  |  |         "r": app.state.config.RELEVANCE_THRESHOLD, | 
					
						
							|  |  |  |         "hybrid": app.state.config.ENABLE_RAG_HYBRID_SEARCH, | 
					
						
							| 
									
										
										
										
											2024-04-26 06:31:21 +08:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-04 07:57:06 +08:00
										 |  |  | class QueryDocForm(BaseModel): | 
					
						
							| 
									
										
										
										
											2024-02-02 05:35:41 +08:00
										 |  |  |     collection_name: str | 
					
						
							|  |  |  |     query: str | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  |     k: Optional[int] = None | 
					
						
							| 
									
										
										
										
											2024-04-23 07:36:46 +08:00
										 |  |  |     r: Optional[float] = None | 
					
						
							| 
									
										
										
										
											2024-04-26 06:31:21 +08:00
										 |  |  |     hybrid: Optional[bool] = None | 
					
						
							| 
									
										
										
										
											2024-02-02 05:35:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-04 07:57:06 +08:00
										 |  |  | @app.post("/query/doc") | 
					
						
							| 
									
										
										
										
											2024-03-09 11:26:39 +08:00
										 |  |  | def query_doc_handler( | 
					
						
							| 
									
										
										
										
											2024-02-04 07:57:06 +08:00
										 |  |  |     form_data: QueryDocForm, | 
					
						
							| 
									
										
										
										
											2024-01-07 18:46:12 +08:00
										 |  |  |     user=Depends(get_current_user), | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-01-07 17:59:00 +08:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         if app.state.config.ENABLE_RAG_HYBRID_SEARCH: | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  |             return query_doc_with_hybrid_search( | 
					
						
							|  |  |  |                 collection_name=form_data.collection_name, | 
					
						
							|  |  |  |                 query=form_data.query, | 
					
						
							| 
									
										
										
										
											2024-04-30 01:15:58 +08:00
										 |  |  |                 embedding_function=app.state.EMBEDDING_FUNCTION, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |                 k=form_data.k if form_data.k else app.state.config.TOP_K, | 
					
						
							| 
									
										
										
										
											2024-04-30 01:15:58 +08:00
										 |  |  |                 reranking_function=app.state.sentence_transformer_rf, | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |                 r=( | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |                     form_data.r if form_data.r else app.state.config.RELEVANCE_THRESHOLD | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |                 ), | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             return query_doc( | 
					
						
							|  |  |  |                 collection_name=form_data.collection_name, | 
					
						
							|  |  |  |                 query=form_data.query, | 
					
						
							| 
									
										
										
										
											2024-04-30 01:15:58 +08:00
										 |  |  |                 embedding_function=app.state.EMBEDDING_FUNCTION, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |                 k=form_data.k if form_data.k else app.state.config.TOP_K, | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-01-07 17:59:00 +08:00
										 |  |  |     except Exception as e: | 
					
						
							| 
									
										
										
										
											2024-03-21 07:11:36 +08:00
										 |  |  |         log.exception(e) | 
					
						
							| 
									
										
										
										
											2024-01-07 17:59:00 +08:00
										 |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-02 05:35:41 +08:00
										 |  |  | class QueryCollectionsForm(BaseModel): | 
					
						
							|  |  |  |     collection_names: List[str] | 
					
						
							|  |  |  |     query: str | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  |     k: Optional[int] = None | 
					
						
							| 
									
										
										
										
											2024-04-23 07:36:46 +08:00
										 |  |  |     r: Optional[float] = None | 
					
						
							| 
									
										
										
										
											2024-04-26 06:31:21 +08:00
										 |  |  |     hybrid: Optional[bool] = None | 
					
						
							| 
									
										
										
										
											2024-02-02 05:35:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-04 07:57:06 +08:00
										 |  |  | @app.post("/query/collection") | 
					
						
							| 
									
										
										
										
											2024-03-09 11:26:39 +08:00
										 |  |  | def query_collection_handler( | 
					
						
							| 
									
										
										
										
											2024-02-02 05:35:41 +08:00
										 |  |  |     form_data: QueryCollectionsForm, | 
					
						
							|  |  |  |     user=Depends(get_current_user), | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-04-15 05:55:00 +08:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         if app.state.config.ENABLE_RAG_HYBRID_SEARCH: | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  |             return query_collection_with_hybrid_search( | 
					
						
							|  |  |  |                 collection_names=form_data.collection_names, | 
					
						
							|  |  |  |                 query=form_data.query, | 
					
						
							| 
									
										
										
										
											2024-04-30 01:15:58 +08:00
										 |  |  |                 embedding_function=app.state.EMBEDDING_FUNCTION, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |                 k=form_data.k if form_data.k else app.state.config.TOP_K, | 
					
						
							| 
									
										
										
										
											2024-04-30 01:15:58 +08:00
										 |  |  |                 reranking_function=app.state.sentence_transformer_rf, | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |                 r=( | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |                     form_data.r if form_data.r else app.state.config.RELEVANCE_THRESHOLD | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |                 ), | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             return query_collection( | 
					
						
							|  |  |  |                 collection_names=form_data.collection_names, | 
					
						
							|  |  |  |                 query=form_data.query, | 
					
						
							| 
									
										
										
										
											2024-04-30 01:15:58 +08:00
										 |  |  |                 embedding_function=app.state.EMBEDDING_FUNCTION, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |                 k=form_data.k if form_data.k else app.state.config.TOP_K, | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 05:55:00 +08:00
										 |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-02-02 05:35:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-02 08:17:00 +08:00
										 |  |  | @app.post("/youtube") | 
					
						
							|  |  |  | def store_youtube_video(form_data: UrlForm, user=Depends(get_current_user)): | 
					
						
							|  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |         loader = YoutubeLoader.from_youtube_url( | 
					
						
							|  |  |  |             form_data.url, | 
					
						
							|  |  |  |             add_video_info=True, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             language=app.state.config.YOUTUBE_LOADER_LANGUAGE, | 
					
						
							| 
									
										
										
										
											2024-05-09 01:47:05 +08:00
										 |  |  |             translation=app.state.YOUTUBE_LOADER_TRANSLATION, | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-05-02 08:17:00 +08:00
										 |  |  |         data = loader.load() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         collection_name = form_data.collection_name | 
					
						
							|  |  |  |         if collection_name == "": | 
					
						
							|  |  |  |             collection_name = calculate_sha256_string(form_data.url)[:63] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         store_data_in_vector_db(data, collection_name, overwrite=True) | 
					
						
							|  |  |  |         return { | 
					
						
							|  |  |  |             "status": True, | 
					
						
							|  |  |  |             "collection_name": collection_name, | 
					
						
							|  |  |  |             "filename": form_data.url, | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | @app.post("/web") | 
					
						
							| 
									
										
										
										
											2024-05-02 08:17:00 +08:00
										 |  |  | def store_web(form_data: UrlForm, user=Depends(get_current_user)): | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  |     # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" | 
					
						
							|  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  |         loader = get_web_loader( | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |             form_data.url, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             verify_ssl=app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, | 
					
						
							| 
									
										
										
										
											2024-05-07 05:50:55 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  |         data = loader.load() | 
					
						
							| 
									
										
										
										
											2024-01-27 14:17:28 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         collection_name = form_data.collection_name | 
					
						
							|  |  |  |         if collection_name == "": | 
					
						
							|  |  |  |             collection_name = calculate_sha256_string(form_data.url)[:63] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-05 00:00:06 +08:00
										 |  |  |         store_data_in_vector_db(data, collection_name, overwrite=True) | 
					
						
							| 
									
										
										
										
											2024-01-08 17:26:15 +08:00
										 |  |  |         return { | 
					
						
							|  |  |  |             "status": True, | 
					
						
							| 
									
										
										
										
											2024-01-27 14:17:28 +08:00
										 |  |  |             "collection_name": collection_name, | 
					
						
							| 
									
										
										
										
											2024-01-08 17:26:15 +08:00
										 |  |  |             "filename": form_data.url, | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  |     except Exception as e: | 
					
						
							| 
									
										
										
										
											2024-03-21 07:11:36 +08:00
										 |  |  |         log.exception(e) | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-11 23:12:52 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-07 07:26:44 +08:00
										 |  |  | def get_web_loader(url: Union[str, Sequence[str]], verify_ssl: bool = True): | 
					
						
							| 
									
										
										
										
											2024-04-30 03:55:17 +08:00
										 |  |  |     # Check if the URL is valid | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
										 |  |  |     if not validate_url(url): | 
					
						
							| 
									
										
										
										
											2024-04-30 03:55:17 +08:00
										 |  |  |         raise ValueError(ERROR_MESSAGES.INVALID_URL) | 
					
						
							| 
									
										
										
										
											2024-06-11 23:06:14 +08:00
										 |  |  |     return SafeWebBaseLoader( | 
					
						
							| 
									
										
										
										
											2024-05-11 23:12:52 +08:00
										 |  |  |         url, | 
					
						
							|  |  |  |         verify_ssl=verify_ssl, | 
					
						
							|  |  |  |         requests_per_second=RAG_WEB_SEARCH_CONCURRENT_REQUESTS, | 
					
						
							| 
									
										
										
										
											2024-05-12 15:19:07 +08:00
										 |  |  |         continue_on_failure=True, | 
					
						
							| 
									
										
										
										
											2024-05-11 23:12:52 +08:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-04-30 03:55:17 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
										 |  |  | def validate_url(url: Union[str, Sequence[str]]): | 
					
						
							|  |  |  |     if isinstance(url, str): | 
					
						
							|  |  |  |         if isinstance(validators.url(url), validators.ValidationError): | 
					
						
							|  |  |  |             raise ValueError(ERROR_MESSAGES.INVALID_URL) | 
					
						
							| 
									
										
										
										
											2024-05-11 23:12:52 +08:00
										 |  |  |         if not ENABLE_RAG_LOCAL_WEB_FETCH: | 
					
						
							| 
									
										
										
										
											2024-06-13 02:08:05 +08:00
										 |  |  |             # Local web fetch is disabled, filter out any URLs that resolve to private IP addresses | 
					
						
							|  |  |  |             parsed_url = urllib.parse.urlparse(url) | 
					
						
							|  |  |  |             # Get IPv4 and IPv6 addresses | 
					
						
							|  |  |  |             ipv4_addresses, ipv6_addresses = resolve_hostname(parsed_url.hostname) | 
					
						
							|  |  |  |             # Check if any of the resolved addresses are private | 
					
						
							|  |  |  |             # This is technically still vulnerable to DNS rebinding attacks, as we don't control WebBaseLoader | 
					
						
							|  |  |  |             for ip in ipv4_addresses: | 
					
						
							|  |  |  |                 if validators.ipv4(ip, private=True): | 
					
						
							|  |  |  |                     raise ValueError(ERROR_MESSAGES.INVALID_URL) | 
					
						
							|  |  |  |             for ip in ipv6_addresses: | 
					
						
							|  |  |  |                 if validators.ipv6(ip, private=True): | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
										 |  |  |                     raise ValueError(ERROR_MESSAGES.INVALID_URL) | 
					
						
							|  |  |  |         return True | 
					
						
							|  |  |  |     elif isinstance(url, Sequence): | 
					
						
							|  |  |  |         return all(validate_url(u) for u in url) | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         return False | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-12 16:37:53 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-13 02:08:05 +08:00
										 |  |  | def resolve_hostname(hostname): | 
					
						
							|  |  |  |     # Get address information | 
					
						
							|  |  |  |     addr_info = socket.getaddrinfo(hostname, None) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Extract IP addresses from address information | 
					
						
							|  |  |  |     ipv4_addresses = [info[4][0] for info in addr_info if info[0] == socket.AF_INET] | 
					
						
							|  |  |  |     ipv6_addresses = [info[4][0] for info in addr_info if info[0] == socket.AF_INET6] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return ipv4_addresses, ipv6_addresses | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  | def search_web(engine: str, query: str) -> list[SearchResult]: | 
					
						
							|  |  |  |     """Search the web using a search engine and return the results as a list of SearchResult objects.
 | 
					
						
							|  |  |  |     Will look for a search engine API key in environment variables in the following order: | 
					
						
							|  |  |  |     - SEARXNG_QUERY_URL | 
					
						
							|  |  |  |     - GOOGLE_PSE_API_KEY + GOOGLE_PSE_ENGINE_ID | 
					
						
							|  |  |  |     - BRAVE_SEARCH_API_KEY | 
					
						
							|  |  |  |     - SERPSTACK_API_KEY | 
					
						
							|  |  |  |     - SERPER_API_KEY | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |     - SERPLY_API_KEY | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     Args: | 
					
						
							|  |  |  |         query (str): The query to search for | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # TODO: add playwright to search the web | 
					
						
							|  |  |  |     if engine == "searxng": | 
					
						
							|  |  |  |         if app.state.config.SEARXNG_QUERY_URL: | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |             return search_searxng( | 
					
						
							|  |  |  |                 app.state.config.SEARXNG_QUERY_URL, | 
					
						
							|  |  |  |                 query, | 
					
						
							|  |  |  |                 app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No SEARXNG_QUERY_URL found in environment variables") | 
					
						
							|  |  |  |     elif engine == "google_pse": | 
					
						
							|  |  |  |         if ( | 
					
						
							|  |  |  |             app.state.config.GOOGLE_PSE_API_KEY | 
					
						
							|  |  |  |             and app.state.config.GOOGLE_PSE_ENGINE_ID | 
					
						
							|  |  |  |         ): | 
					
						
							|  |  |  |             return search_google_pse( | 
					
						
							|  |  |  |                 app.state.config.GOOGLE_PSE_API_KEY, | 
					
						
							|  |  |  |                 app.state.config.GOOGLE_PSE_ENGINE_ID, | 
					
						
							|  |  |  |                 query, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |                 app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise Exception( | 
					
						
							|  |  |  |                 "No GOOGLE_PSE_API_KEY or GOOGLE_PSE_ENGINE_ID found in environment variables" | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |     elif engine == "brave": | 
					
						
							|  |  |  |         if app.state.config.BRAVE_SEARCH_API_KEY: | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |             return search_brave( | 
					
						
							|  |  |  |                 app.state.config.BRAVE_SEARCH_API_KEY, | 
					
						
							|  |  |  |                 query, | 
					
						
							|  |  |  |                 app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No BRAVE_SEARCH_API_KEY found in environment variables") | 
					
						
							|  |  |  |     elif engine == "serpstack": | 
					
						
							|  |  |  |         if app.state.config.SERPSTACK_API_KEY: | 
					
						
							|  |  |  |             return search_serpstack( | 
					
						
							|  |  |  |                 app.state.config.SERPSTACK_API_KEY, | 
					
						
							|  |  |  |                 query, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |                 app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |                 https_enabled=app.state.config.SERPSTACK_HTTPS, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No SERPSTACK_API_KEY found in environment variables") | 
					
						
							|  |  |  |     elif engine == "serper": | 
					
						
							|  |  |  |         if app.state.config.SERPER_API_KEY: | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  |             return search_serper( | 
					
						
							|  |  |  |                 app.state.config.SERPER_API_KEY, | 
					
						
							|  |  |  |                 query, | 
					
						
							|  |  |  |                 app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No SERPER_API_KEY found in environment variables") | 
					
						
							| 
									
										
										
										
											2024-06-10 08:44:34 +08:00
										 |  |  |     elif engine == "serply": | 
					
						
							|  |  |  |         if app.state.config.SERPLY_API_KEY: | 
					
						
							|  |  |  |             return search_serply( | 
					
						
							|  |  |  |                 app.state.config.SERPLY_API_KEY, | 
					
						
							|  |  |  |                 query, | 
					
						
							|  |  |  |                 app.state.config.RAG_WEB_SEARCH_RESULT_COUNT, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise Exception("No SERPLY_API_KEY found in environment variables") | 
					
						
							| 
									
										
										
										
											2024-06-11 22:19:08 +08:00
										 |  |  |     elif engine == "duckduckgo": | 
					
						
							|  |  |  |         return search_duckduckgo(query, app.state.config.RAG_WEB_SEARCH_RESULT_COUNT) | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |     else: | 
					
						
							|  |  |  |         raise Exception("No search engine API key found in environment variables") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-28 03:48:08 +08:00
										 |  |  | @app.post("/web/search") | 
					
						
							|  |  |  | def store_web_search(form_data: SearchForm, user=Depends(get_current_user)): | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-06-12 15:18:22 +08:00
										 |  |  |         logging.info( | 
					
						
							|  |  |  |             f"trying to web search with {app.state.config.RAG_WEB_SEARCH_ENGINE, form_data.query}" | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-06-02 10:52:12 +08:00
										 |  |  |         web_results = search_web( | 
					
						
							|  |  |  |             app.state.config.RAG_WEB_SEARCH_ENGINE, form_data.query | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         print(e) | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.WEB_SEARCH_ERROR(e), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
										 |  |  |         urls = [result.link for result in web_results] | 
					
						
							|  |  |  |         loader = get_web_loader(urls) | 
					
						
							| 
									
										
										
										
											2024-05-28 05:25:36 +08:00
										 |  |  |         data = loader.load() | 
					
						
							| 
									
										
										
										
											2024-05-06 16:39:25 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         collection_name = form_data.collection_name | 
					
						
							|  |  |  |         if collection_name == "": | 
					
						
							|  |  |  |             collection_name = calculate_sha256_string(form_data.query)[:63] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         store_data_in_vector_db(data, collection_name, overwrite=True) | 
					
						
							|  |  |  |         return { | 
					
						
							|  |  |  |             "status": True, | 
					
						
							|  |  |  |             "collection_name": collection_name, | 
					
						
							|  |  |  |             "filenames": urls, | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         log.exception(e) | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  | def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool: | 
					
						
							| 
									
										
										
										
											2024-03-26 14:47:08 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  |     text_splitter = RecursiveCharacterTextSplitter( | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         chunk_size=app.state.config.CHUNK_SIZE, | 
					
						
							|  |  |  |         chunk_overlap=app.state.config.CHUNK_OVERLAP, | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  |         add_start_index=True, | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-04-15 05:55:00 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  |     docs = text_splitter.split_documents(data) | 
					
						
							| 
									
										
										
										
											2024-03-26 14:47:08 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if len(docs) > 0: | 
					
						
							| 
									
										
										
										
											2024-04-15 07:48:15 +08:00
										 |  |  |         log.info(f"store_data_in_vector_db {docs}") | 
					
						
							| 
									
										
										
										
											2024-03-26 14:47:08 +08:00
										 |  |  |         return store_docs_in_vector_db(docs, collection_name, overwrite), None | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT) | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def store_text_in_vector_db( | 
					
						
							| 
									
										
										
										
											2024-03-24 15:41:41 +08:00
										 |  |  |     text, metadata, collection_name, overwrite: bool = False | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  | ) -> bool: | 
					
						
							|  |  |  |     text_splitter = RecursiveCharacterTextSplitter( | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |         chunk_size=app.state.config.CHUNK_SIZE, | 
					
						
							|  |  |  |         chunk_overlap=app.state.config.CHUNK_OVERLAP, | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  |         add_start_index=True, | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-03-24 15:41:41 +08:00
										 |  |  |     docs = text_splitter.create_documents([text], metadatas=[metadata]) | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  |     return store_docs_in_vector_db(docs, collection_name, overwrite) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 06:47:45 +08:00
										 |  |  | def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool: | 
					
						
							| 
									
										
										
										
											2024-04-15 07:48:15 +08:00
										 |  |  |     log.info(f"store_docs_in_vector_db {docs} {collection_name}") | 
					
						
							| 
									
										
										
										
											2024-03-26 14:47:08 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  |     texts = [doc.page_content for doc in docs] | 
					
						
							|  |  |  |     metadatas = [doc.metadata for doc in docs] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-08 12:18:04 +08:00
										 |  |  |     # ChromaDB does not like datetime formats | 
					
						
							|  |  |  |     # for meta-data so convert them to string. | 
					
						
							|  |  |  |     for metadata in metadatas: | 
					
						
							|  |  |  |         for key, value in metadata.items(): | 
					
						
							|  |  |  |             if isinstance(value, datetime): | 
					
						
							|  |  |  |                 metadata[key] = str(value) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  |     try: | 
					
						
							|  |  |  |         if overwrite: | 
					
						
							|  |  |  |             for collection in CHROMA_CLIENT.list_collections(): | 
					
						
							|  |  |  |                 if collection_name == collection.name: | 
					
						
							| 
									
										
										
										
											2024-04-01 03:17:29 +08:00
										 |  |  |                     log.info(f"deleting existing collection {collection_name}") | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  |                     CHROMA_CLIENT.delete_collection(name=collection_name) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-23 02:27:43 +08:00
										 |  |  |         collection = CHROMA_CLIENT.create_collection(name=collection_name) | 
					
						
							| 
									
										
										
										
											2024-04-15 05:55:00 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-28 03:38:50 +08:00
										 |  |  |         embedding_func = get_embedding_function( | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             app.state.config.RAG_EMBEDDING_ENGINE, | 
					
						
							|  |  |  |             app.state.config.RAG_EMBEDDING_MODEL, | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  |             app.state.sentence_transformer_ef, | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             app.state.config.OPENAI_API_KEY, | 
					
						
							|  |  |  |             app.state.config.OPENAI_API_BASE_URL, | 
					
						
							| 
									
										
										
										
											2024-06-02 22:34:31 +08:00
										 |  |  |             app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE, | 
					
						
							| 
									
										
										
										
											2024-04-23 04:49:58 +08:00
										 |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         embedding_texts = list(map(lambda x: x.replace("\n", " "), texts)) | 
					
						
							| 
									
										
										
										
											2024-04-23 07:36:46 +08:00
										 |  |  |         embeddings = embedding_func(embedding_texts) | 
					
						
							| 
									
										
										
										
											2024-04-23 02:27:43 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         for batch in create_batches( | 
					
						
							|  |  |  |             api=CHROMA_CLIENT, | 
					
						
							| 
									
										
										
										
											2024-05-07 16:42:05 +08:00
										 |  |  |             ids=[str(uuid.uuid4()) for _ in texts], | 
					
						
							| 
									
										
										
										
											2024-04-23 02:27:43 +08:00
										 |  |  |             metadatas=metadatas, | 
					
						
							|  |  |  |             embeddings=embeddings, | 
					
						
							|  |  |  |             documents=texts, | 
					
						
							|  |  |  |         ): | 
					
						
							|  |  |  |             collection.add(*batch) | 
					
						
							| 
									
										
										
										
											2024-04-09 22:38:40 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 07:15:39 +08:00
										 |  |  |         return True | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  |     except Exception as e: | 
					
						
							| 
									
										
										
										
											2024-04-01 03:17:29 +08:00
										 |  |  |         log.exception(e) | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  |         if e.__class__.__name__ == "UniqueConstraintError": | 
					
						
							|  |  |  |             return True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | def get_loader(filename: str, file_content_type: str, file_path: str): | 
					
						
							|  |  |  |     file_ext = filename.split(".")[-1].lower() | 
					
						
							| 
									
										
										
										
											2024-01-25 16:24:49 +08:00
										 |  |  |     known_type = True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     known_source_ext = [ | 
					
						
							|  |  |  |         "go", | 
					
						
							|  |  |  |         "py", | 
					
						
							|  |  |  |         "java", | 
					
						
							|  |  |  |         "sh", | 
					
						
							|  |  |  |         "bat", | 
					
						
							|  |  |  |         "ps1", | 
					
						
							|  |  |  |         "cmd", | 
					
						
							|  |  |  |         "js", | 
					
						
							|  |  |  |         "ts", | 
					
						
							|  |  |  |         "css", | 
					
						
							|  |  |  |         "cpp", | 
					
						
							|  |  |  |         "hpp", | 
					
						
							|  |  |  |         "h", | 
					
						
							|  |  |  |         "c", | 
					
						
							|  |  |  |         "cs", | 
					
						
							|  |  |  |         "sql", | 
					
						
							|  |  |  |         "log", | 
					
						
							|  |  |  |         "ini", | 
					
						
							|  |  |  |         "pl", | 
					
						
							|  |  |  |         "pm", | 
					
						
							|  |  |  |         "r", | 
					
						
							|  |  |  |         "dart", | 
					
						
							|  |  |  |         "dockerfile", | 
					
						
							|  |  |  |         "env", | 
					
						
							|  |  |  |         "php", | 
					
						
							|  |  |  |         "hs", | 
					
						
							|  |  |  |         "hsc", | 
					
						
							|  |  |  |         "lua", | 
					
						
							|  |  |  |         "nginxconf", | 
					
						
							|  |  |  |         "conf", | 
					
						
							|  |  |  |         "m", | 
					
						
							|  |  |  |         "mm", | 
					
						
							|  |  |  |         "plsql", | 
					
						
							|  |  |  |         "perl", | 
					
						
							|  |  |  |         "rb", | 
					
						
							|  |  |  |         "rs", | 
					
						
							|  |  |  |         "db2", | 
					
						
							|  |  |  |         "scala", | 
					
						
							|  |  |  |         "bash", | 
					
						
							|  |  |  |         "swift", | 
					
						
							|  |  |  |         "vue", | 
					
						
							|  |  |  |         "svelte", | 
					
						
							| 
									
										
										
										
											2024-06-08 12:41:30 +08:00
										 |  |  |         "msg", | 
					
						
							| 
									
										
										
										
											2024-01-25 16:24:49 +08:00
										 |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if file_ext == "pdf": | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |         loader = PyPDFLoader( | 
					
						
							| 
									
										
										
										
											2024-05-10 15:03:24 +08:00
										 |  |  |             file_path, extract_images=app.state.config.PDF_EXTRACT_IMAGES | 
					
						
							| 
									
										
										
										
											2024-05-10 13:36:10 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-01-25 16:24:49 +08:00
										 |  |  |     elif file_ext == "csv": | 
					
						
							|  |  |  |         loader = CSVLoader(file_path) | 
					
						
							|  |  |  |     elif file_ext == "rst": | 
					
						
							|  |  |  |         loader = UnstructuredRSTLoader(file_path, mode="elements") | 
					
						
							|  |  |  |     elif file_ext == "xml": | 
					
						
							|  |  |  |         loader = UnstructuredXMLLoader(file_path) | 
					
						
							| 
									
										
										
										
											2024-03-25 16:50:53 +08:00
										 |  |  |     elif file_ext in ["htm", "html"]: | 
					
						
							| 
									
										
										
										
											2024-03-26 14:50:52 +08:00
										 |  |  |         loader = BSHTMLLoader(file_path, open_encoding="unicode_escape") | 
					
						
							| 
									
										
										
										
											2024-01-25 16:24:49 +08:00
										 |  |  |     elif file_ext == "md": | 
					
						
							|  |  |  |         loader = UnstructuredMarkdownLoader(file_path) | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  |     elif file_content_type == "application/epub+zip": | 
					
						
							| 
									
										
										
										
											2024-01-25 16:24:49 +08:00
										 |  |  |         loader = UnstructuredEPubLoader(file_path) | 
					
						
							|  |  |  |     elif ( | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  |         file_content_type | 
					
						
							| 
									
										
										
										
											2024-01-25 16:24:49 +08:00
										 |  |  |         == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | 
					
						
							|  |  |  |         or file_ext in ["doc", "docx"] | 
					
						
							|  |  |  |     ): | 
					
						
							|  |  |  |         loader = Docx2txtLoader(file_path) | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  |     elif file_content_type in [ | 
					
						
							| 
									
										
										
										
											2024-01-25 16:24:49 +08:00
										 |  |  |         "application/vnd.ms-excel", | 
					
						
							|  |  |  |         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | 
					
						
							|  |  |  |     ] or file_ext in ["xls", "xlsx"]: | 
					
						
							|  |  |  |         loader = UnstructuredExcelLoader(file_path) | 
					
						
							| 
									
										
										
										
											2024-05-20 22:22:43 +08:00
										 |  |  |     elif file_content_type in [ | 
					
						
							|  |  |  |         "application/vnd.ms-powerpoint", | 
					
						
							|  |  |  |         "application/vnd.openxmlformats-officedocument.presentationml.presentation", | 
					
						
							|  |  |  |     ] or file_ext in ["ppt", "pptx"]: | 
					
						
							|  |  |  |         loader = UnstructuredPowerPointLoader(file_path) | 
					
						
							| 
									
										
										
										
											2024-06-08 12:18:04 +08:00
										 |  |  |     elif file_ext == "msg": | 
					
						
							|  |  |  |         loader = OutlookMessageLoader(file_path) | 
					
						
							| 
									
										
										
										
											2024-03-03 10:56:57 +08:00
										 |  |  |     elif file_ext in known_source_ext or ( | 
					
						
							|  |  |  |         file_content_type and file_content_type.find("text/") >= 0 | 
					
						
							|  |  |  |     ): | 
					
						
							| 
									
										
										
										
											2024-03-16 14:52:37 +08:00
										 |  |  |         loader = TextLoader(file_path, autodetect_encoding=True) | 
					
						
							| 
									
										
										
										
											2024-01-25 16:24:49 +08:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2024-03-16 14:52:37 +08:00
										 |  |  |         loader = TextLoader(file_path, autodetect_encoding=True) | 
					
						
							| 
									
										
										
										
											2024-01-25 16:24:49 +08:00
										 |  |  |         known_type = False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return loader, known_type | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | @app.post("/doc") | 
					
						
							| 
									
										
										
										
											2024-01-07 18:46:12 +08:00
										 |  |  | def store_doc( | 
					
						
							| 
									
										
										
										
											2024-01-08 01:00:30 +08:00
										 |  |  |     collection_name: Optional[str] = Form(None), | 
					
						
							| 
									
										
										
										
											2024-01-07 18:46:12 +08:00
										 |  |  |     file: UploadFile = File(...), | 
					
						
							|  |  |  |     user=Depends(get_current_user), | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  |     # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" | 
					
						
							| 
									
										
										
										
											2024-01-07 15:40:51 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-21 07:11:36 +08:00
										 |  |  |     log.info(f"file.content_type: {file.content_type}") | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2024-04-02 04:55:14 +08:00
										 |  |  |         unsanitized_filename = file.filename | 
					
						
							| 
									
										
										
										
											2024-04-05 08:38:59 +08:00
										 |  |  |         filename = os.path.basename(unsanitized_filename) | 
					
						
							| 
									
										
										
										
											2024-04-02 04:55:14 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-05 08:38:59 +08:00
										 |  |  |         file_path = f"{UPLOAD_DIR}/{filename}" | 
					
						
							| 
									
										
										
										
											2024-04-02 04:55:14 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  |         contents = file.file.read() | 
					
						
							| 
									
										
										
										
											2024-01-07 15:40:51 +08:00
										 |  |  |         with open(file_path, "wb") as f: | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  |             f.write(contents) | 
					
						
							|  |  |  |             f.close() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-08 01:00:30 +08:00
										 |  |  |         f = open(file_path, "rb") | 
					
						
							|  |  |  |         if collection_name == None: | 
					
						
							|  |  |  |             collection_name = calculate_sha256(f)[:63] | 
					
						
							|  |  |  |         f.close() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-05 08:38:59 +08:00
										 |  |  |         loader, known_type = get_loader(filename, file.content_type, file_path) | 
					
						
							| 
									
										
										
										
											2024-01-07 15:40:51 +08:00
										 |  |  |         data = loader.load() | 
					
						
							| 
									
										
										
										
											2024-03-26 14:47:08 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             result = store_data_in_vector_db(data, collection_name) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             if result: | 
					
						
							|  |  |  |                 return { | 
					
						
							|  |  |  |                     "status": True, | 
					
						
							|  |  |  |                     "collection_name": collection_name, | 
					
						
							|  |  |  |                     "filename": filename, | 
					
						
							|  |  |  |                     "known_type": known_type, | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |         except Exception as e: | 
					
						
							| 
									
										
										
										
											2024-01-07 17:40:36 +08:00
										 |  |  |             raise HTTPException( | 
					
						
							|  |  |  |                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, | 
					
						
							| 
									
										
										
										
											2024-03-26 14:47:08 +08:00
										 |  |  |                 detail=e, | 
					
						
							| 
									
										
										
										
											2024-01-07 17:40:36 +08:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  |     except Exception as e: | 
					
						
							| 
									
										
										
										
											2024-03-21 07:11:36 +08:00
										 |  |  |         log.exception(e) | 
					
						
							| 
									
										
										
										
											2024-01-13 21:46:56 +08:00
										 |  |  |         if "No pandoc was found" in str(e): | 
					
						
							|  |  |  |             raise HTTPException( | 
					
						
							|  |  |  |                 status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |                 detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise HTTPException( | 
					
						
							|  |  |  |                 status_code=status.HTTP_400_BAD_REQUEST, | 
					
						
							|  |  |  |                 detail=ERROR_MESSAGES.DEFAULT(e), | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-01-07 14:59:22 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  | class TextRAGForm(BaseModel): | 
					
						
							|  |  |  |     name: str | 
					
						
							|  |  |  |     content: str | 
					
						
							|  |  |  |     collection_name: Optional[str] = None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @app.post("/text") | 
					
						
							|  |  |  | def store_text( | 
					
						
							|  |  |  |     form_data: TextRAGForm, | 
					
						
							|  |  |  |     user=Depends(get_current_user), | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     collection_name = form_data.collection_name | 
					
						
							|  |  |  |     if collection_name == None: | 
					
						
							|  |  |  |         collection_name = calculate_sha256_string(form_data.content) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-24 15:41:41 +08:00
										 |  |  |     result = store_text_in_vector_db( | 
					
						
							|  |  |  |         form_data.content, | 
					
						
							|  |  |  |         metadata={"name": form_data.name, "created_by": user.id}, | 
					
						
							|  |  |  |         collection_name=collection_name, | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-03-24 15:40:27 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if result: | 
					
						
							|  |  |  |         return {"status": True, "collection_name": collection_name} | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         raise HTTPException( | 
					
						
							|  |  |  |             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, | 
					
						
							|  |  |  |             detail=ERROR_MESSAGES.DEFAULT(), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | @app.get("/scan") | 
					
						
							|  |  |  | def scan_docs_dir(user=Depends(get_admin_user)): | 
					
						
							| 
									
										
										
										
											2024-02-23 18:57:31 +08:00
										 |  |  |     for path in Path(DOCS_DIR).rglob("./**/*"): | 
					
						
							|  |  |  |         try: | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  |             if path.is_file() and not path.name.startswith("."): | 
					
						
							|  |  |  |                 tags = extract_folders_after_data_docs(path) | 
					
						
							|  |  |  |                 filename = path.name | 
					
						
							|  |  |  |                 file_content_type = mimetypes.guess_type(path) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 f = open(path, "rb") | 
					
						
							|  |  |  |                 collection_name = calculate_sha256(f)[:63] | 
					
						
							|  |  |  |                 f.close() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-18 13:31:46 +08:00
										 |  |  |                 loader, known_type = get_loader( | 
					
						
							|  |  |  |                     filename, file_content_type[0], str(path) | 
					
						
							|  |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  |                 data = loader.load() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-26 14:47:08 +08:00
										 |  |  |                 try: | 
					
						
							|  |  |  |                     result = store_data_in_vector_db(data, collection_name) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                     if result: | 
					
						
							|  |  |  |                         sanitized_filename = sanitize_filename(filename) | 
					
						
							|  |  |  |                         doc = Documents.get_doc_by_name(sanitized_filename) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                         if doc == None: | 
					
						
							|  |  |  |                             doc = Documents.insert_new_doc( | 
					
						
							|  |  |  |                                 user.id, | 
					
						
							|  |  |  |                                 DocumentForm( | 
					
						
							|  |  |  |                                     **{ | 
					
						
							|  |  |  |                                         "name": sanitized_filename, | 
					
						
							|  |  |  |                                         "title": filename, | 
					
						
							|  |  |  |                                         "collection_name": collection_name, | 
					
						
							|  |  |  |                                         "filename": filename, | 
					
						
							|  |  |  |                                         "content": ( | 
					
						
							|  |  |  |                                             json.dumps( | 
					
						
							|  |  |  |                                                 { | 
					
						
							|  |  |  |                                                     "tags": list( | 
					
						
							|  |  |  |                                                         map( | 
					
						
							|  |  |  |                                                             lambda name: {"name": name}, | 
					
						
							|  |  |  |                                                             tags, | 
					
						
							|  |  |  |                                                         ) | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  |                                                     ) | 
					
						
							| 
									
										
										
										
											2024-03-26 14:47:08 +08:00
										 |  |  |                                                 } | 
					
						
							|  |  |  |                                             ) | 
					
						
							|  |  |  |                                             if len(tags) | 
					
						
							|  |  |  |                                             else "{}" | 
					
						
							|  |  |  |                                         ), | 
					
						
							|  |  |  |                                     } | 
					
						
							|  |  |  |                                 ), | 
					
						
							|  |  |  |                             ) | 
					
						
							|  |  |  |                 except Exception as e: | 
					
						
							| 
									
										
										
										
											2024-04-01 03:17:29 +08:00
										 |  |  |                     log.exception(e) | 
					
						
							| 
									
										
										
										
											2024-03-26 14:47:08 +08:00
										 |  |  |                     pass | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-23 18:57:31 +08:00
										 |  |  |         except Exception as e: | 
					
						
							| 
									
										
										
										
											2024-03-21 07:11:36 +08:00
										 |  |  |             log.exception(e) | 
					
						
							| 
									
										
										
										
											2024-02-18 13:06:08 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-07 17:40:36 +08:00
										 |  |  | @app.get("/reset/db") | 
					
						
							| 
									
										
										
										
											2024-02-09 08:05:01 +08:00
										 |  |  | def reset_vector_db(user=Depends(get_admin_user)): | 
					
						
							|  |  |  |     CHROMA_CLIENT.reset() | 
					
						
							| 
									
										
										
										
											2024-01-07 17:40:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-04 12:45:36 +08:00
										 |  |  | @app.get("/reset/uploads") | 
					
						
							|  |  |  | def reset_upload_dir(user=Depends(get_admin_user)) -> bool: | 
					
						
							|  |  |  |     folder = f"{UPLOAD_DIR}" | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         # Check if the directory exists | 
					
						
							|  |  |  |         if os.path.exists(folder): | 
					
						
							|  |  |  |             # Iterate over all the files and directories in the specified directory | 
					
						
							|  |  |  |             for filename in os.listdir(folder): | 
					
						
							|  |  |  |                 file_path = os.path.join(folder, filename) | 
					
						
							|  |  |  |                 try: | 
					
						
							|  |  |  |                     if os.path.isfile(file_path) or os.path.islink(file_path): | 
					
						
							|  |  |  |                         os.unlink(file_path)  # Remove the file or link | 
					
						
							|  |  |  |                     elif os.path.isdir(file_path): | 
					
						
							|  |  |  |                         shutil.rmtree(file_path)  # Remove the directory | 
					
						
							|  |  |  |                 except Exception as e: | 
					
						
							|  |  |  |                     print(f"Failed to delete {file_path}. Reason: {e}") | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             print(f"The directory {folder} does not exist") | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         print(f"Failed to process the directory {folder}. Reason: {e}") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-07 17:40:36 +08:00
										 |  |  | @app.get("/reset") | 
					
						
							| 
									
										
										
										
											2024-02-09 08:05:01 +08:00
										 |  |  | def reset(user=Depends(get_admin_user)) -> bool: | 
					
						
							|  |  |  |     folder = f"{UPLOAD_DIR}" | 
					
						
							|  |  |  |     for filename in os.listdir(folder): | 
					
						
							|  |  |  |         file_path = os.path.join(folder, filename) | 
					
						
							| 
									
										
										
										
											2024-01-07 17:40:36 +08:00
										 |  |  |         try: | 
					
						
							| 
									
										
										
										
											2024-02-09 08:05:01 +08:00
										 |  |  |             if os.path.isfile(file_path) or os.path.islink(file_path): | 
					
						
							|  |  |  |                 os.unlink(file_path) | 
					
						
							|  |  |  |             elif os.path.isdir(file_path): | 
					
						
							|  |  |  |                 shutil.rmtree(file_path) | 
					
						
							| 
									
										
										
										
											2024-01-07 17:40:36 +08:00
										 |  |  |         except Exception as e: | 
					
						
							| 
									
										
										
										
											2024-03-21 07:11:36 +08:00
										 |  |  |             log.error("Failed to delete %s. Reason: %s" % (file_path, e)) | 
					
						
							| 
									
										
										
										
											2024-01-07 17:40:36 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-09 08:05:01 +08:00
										 |  |  |     try: | 
					
						
							|  |  |  |         CHROMA_CLIENT.reset() | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							| 
									
										
										
										
											2024-03-21 07:11:36 +08:00
										 |  |  |         log.exception(e) | 
					
						
							| 
									
										
										
										
											2024-02-09 08:05:01 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return True | 
					
						
							| 
									
										
										
										
											2024-05-19 21:51:32 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-12 15:18:22 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-11 23:06:14 +08:00
										 |  |  | class SafeWebBaseLoader(WebBaseLoader): | 
					
						
							|  |  |  |     """WebBaseLoader with enhanced error handling for URLs.""" | 
					
						
							| 
									
										
										
										
											2024-06-12 15:18:22 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-11 23:06:14 +08:00
										 |  |  |     def lazy_load(self) -> Iterator[Document]: | 
					
						
							|  |  |  |         """Lazy load text from the url(s) in web_path with error handling.""" | 
					
						
							|  |  |  |         for path in self.web_paths: | 
					
						
							|  |  |  |             try: | 
					
						
							|  |  |  |                 soup = self._scrape(path, bs_kwargs=self.bs_kwargs) | 
					
						
							|  |  |  |                 text = soup.get_text(**self.bs_get_text_kwargs) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # Build metadata | 
					
						
							|  |  |  |                 metadata = {"source": path} | 
					
						
							|  |  |  |                 if title := soup.find("title"): | 
					
						
							|  |  |  |                     metadata["title"] = title.get_text() | 
					
						
							|  |  |  |                 if description := soup.find("meta", attrs={"name": "description"}): | 
					
						
							| 
									
										
										
										
											2024-06-12 15:18:22 +08:00
										 |  |  |                     metadata["description"] = description.get( | 
					
						
							|  |  |  |                         "content", "No description found." | 
					
						
							|  |  |  |                     ) | 
					
						
							| 
									
										
										
										
											2024-06-11 23:06:14 +08:00
										 |  |  |                 if html := soup.find("html"): | 
					
						
							|  |  |  |                     metadata["language"] = html.get("lang", "No language found.") | 
					
						
							| 
									
										
										
										
											2024-06-12 15:18:22 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-11 23:06:14 +08:00
										 |  |  |                 yield Document(page_content=text, metadata=metadata) | 
					
						
							|  |  |  |             except Exception as e: | 
					
						
							|  |  |  |                 # Log the error and continue with the next URL | 
					
						
							|  |  |  |                 log.error(f"Error loading {path}: {e}") | 
					
						
							| 
									
										
										
										
											2024-06-12 15:18:22 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-19 21:51:32 +08:00
										 |  |  | if ENV == "dev": | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @app.get("/ef") | 
					
						
							|  |  |  |     async def get_embeddings(): | 
					
						
							|  |  |  |         return {"result": app.state.EMBEDDING_FUNCTION("hello world")} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @app.get("/ef/{text}") | 
					
						
							|  |  |  |     async def get_embeddings_text(text: str): | 
					
						
							|  |  |  |         return {"result": app.state.EMBEDDING_FUNCTION(text)} |