| 
									
										
										
										
											2024-05-06 12:27:46 +08:00
										 |  |  | import json | 
					
						
							|  |  |  | import logging | 
					
						
							| 
									
										
										
										
											2024-08-14 20:46:31 +08:00
										 |  |  | from typing import Optional | 
					
						
							| 
									
										
										
										
											2024-05-06 12:27:46 +08:00
										 |  |  | import requests | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-17 15:38:11 +08:00
										 |  |  | from apps.rag.search.main import SearchResult, get_filtered_results | 
					
						
							| 
									
										
										
										
											2024-06-02 10:57:00 +08:00
										 |  |  | from config import SRC_LOG_LEVELS | 
					
						
							| 
									
										
										
										
											2024-05-06 12:27:46 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | log = logging.getLogger(__name__) | 
					
						
							|  |  |  | log.setLevel(SRC_LOG_LEVELS["RAG"]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def search_serpstack( | 
					
						
							| 
									
										
										
										
											2024-06-18 05:32:23 +08:00
										 |  |  |     api_key: str, | 
					
						
							|  |  |  |     query: str, | 
					
						
							|  |  |  |     count: int, | 
					
						
							| 
									
										
										
										
											2024-08-14 20:46:31 +08:00
										 |  |  |     filter_list: Optional[list[str]] = None, | 
					
						
							| 
									
										
										
										
											2024-06-18 05:32:23 +08:00
										 |  |  |     https_enabled: bool = True, | 
					
						
							| 
									
										
										
										
											2024-05-06 12:27:46 +08:00
										 |  |  | ) -> list[SearchResult]: | 
					
						
							|  |  |  |     """Search using serpstack.com's and return the results as a list of SearchResult objects.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Args: | 
					
						
							|  |  |  |         api_key (str): A serpstack.com API key | 
					
						
							|  |  |  |         query (str): The query to search for | 
					
						
							|  |  |  |         https_enabled (bool): Whether to use HTTPS or HTTP for the API request | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     url = f"{'https' if https_enabled else 'http'}://api.serpstack.com/search" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     headers = {"Content-Type": "application/json"} | 
					
						
							|  |  |  |     params = { | 
					
						
							|  |  |  |         "access_key": api_key, | 
					
						
							|  |  |  |         "query": query, | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     response = requests.request("POST", url, headers=headers, params=params) | 
					
						
							|  |  |  |     response.raise_for_status() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     json_response = response.json() | 
					
						
							|  |  |  |     results = sorted( | 
					
						
							|  |  |  |         json_response.get("organic_results", []), key=lambda x: x.get("position", 0) | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-06-17 15:38:11 +08:00
										 |  |  |     if filter_list: | 
					
						
							|  |  |  |         results = get_filtered_results(results, filter_list) | 
					
						
							| 
									
										
										
										
											2024-05-06 12:27:46 +08:00
										 |  |  |     return [ | 
					
						
							|  |  |  |         SearchResult( | 
					
						
							|  |  |  |             link=result["url"], title=result.get("title"), snippet=result.get("snippet") | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-06-17 15:38:11 +08:00
										 |  |  |         for result in results[:count] | 
					
						
							| 
									
										
										
										
											2024-05-06 12:27:46 +08:00
										 |  |  |     ] |