import requests
import logging
from typing import Iterator, List, Literal, Union

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from open_webui.env import SRC_LOG_LEVELS

log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])


class TavilyLoader(BaseLoader):
    """Extract web page content from URLs using Tavily Extract API.

    This is a LangChain document loader that uses Tavily's Extract API to
    retrieve content from web pages and return it as Document objects.

    Args:
        urls: URL or list of URLs to extract content from.
        api_key: The Tavily API key.
        extract_depth: Depth of extraction, either "basic" or "advanced".
        continue_on_failure: Whether to continue if extraction of a URL fails.
    """

    def __init__(
        self,
        urls: Union[str, List[str]],
        api_key: str,
        extract_depth: Literal["basic", "advanced"] = "basic",
        continue_on_failure: bool = True,
    ) -> None:
        """Initialize the Tavily Extract client.

        Args:
            urls: URL or list of URLs to extract content from.
            api_key: The Tavily API key.
            extract_depth: Depth of extraction, either "basic" or "advanced".
                Advanced extraction retrieves more data, including tables and
                embedded content, with higher success but may increase latency.
                Basic costs 1 credit per 5 successful URL extractions;
                advanced costs 2 credits per 5 successful URL extractions.
            continue_on_failure: Whether to continue if extraction of a URL fails.
        """
        if not urls:
            raise ValueError("At least one URL must be provided.")

        self.api_key = api_key
        self.urls = urls if isinstance(urls, list) else [urls]
        self.extract_depth = extract_depth
        self.continue_on_failure = continue_on_failure
        self.api_url = "https://api.tavily.com/extract"

    def lazy_load(self) -> Iterator[Document]:
        """Extract and yield documents from the URLs using Tavily Extract API."""
        batch_size = 20
        for i in range(0, len(self.urls), batch_size):
            batch_urls = self.urls[i : i + batch_size]
            try:
                headers = {
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {self.api_key}",
                }
                # Use a string for a single URL, an array for multiple URLs
                urls_param = batch_urls[0] if len(batch_urls) == 1 else batch_urls
                payload = {"urls": urls_param, "extract_depth": self.extract_depth}
                # Make the API call
                response = requests.post(self.api_url, headers=headers, json=payload)
                response.raise_for_status()
                response_data = response.json()
                # Process successful results
                for result in response_data.get("results", []):
                    url = result.get("url", "")
                    content = result.get("raw_content", "")
                    if not content:
                        log.warning(f"No content extracted from {url}")
                        continue
                    # Add the source URL as metadata
                    metadata = {"source": url}
                    yield Document(
                        page_content=content,
                        metadata=metadata,
                    )
                # Log any URLs the API reports as failed
                for failed in response_data.get("failed_results", []):
                    url = failed.get("url", "")
                    error = failed.get("error", "Unknown error")
                    log.error(f"Failed to extract content from {url}: {error}")
            except Exception as e:
                if self.continue_on_failure:
                    log.error(f"Error extracting content from batch {batch_urls}: {e}")
                else:
                    raise e
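

# The block below is a usage sketch added for illustration; it is not part of
# the original loader. It assumes you run this module directly inside the
# open-webui codebase and substitute a real Tavily API key for the placeholder
# value below.
if __name__ == "__main__":
    # Hypothetical values: replace with a real key and the pages you want.
    loader = TavilyLoader(
        urls=["https://example.com/article"],
        api_key="tvly-your-api-key",  # placeholder, not a real credential
        extract_depth="basic",  # "advanced" also pulls tables/embedded content
        continue_on_failure=True,  # log failed batches instead of raising
    )
    # lazy_load() yields Document objects one at a time; BaseLoader.load()
    # would collect them into a list instead.
    for doc in loader.lazy_load():
        print(doc.metadata["source"], len(doc.page_content))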