| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
import logging
import ssl
from typing import Optional

from elasticsearch import BadRequestError, Elasticsearch
from elasticsearch.helpers import bulk, scan

from open_webui.config import (
    ELASTICSEARCH_URL,
    ELASTICSEARCH_CA_CERTS,
    ELASTICSEARCH_API_KEY,
    ELASTICSEARCH_USERNAME,
    ELASTICSEARCH_PASSWORD,
    ELASTICSEARCH_CLOUD_ID,
    ELASTICSEARCH_INDEX_PREFIX,
    SSL_ASSERT_FINGERPRINT,
)
from open_webui.retrieval.vector.main import (
    VectorDBBase,
    VectorItem,
    SearchResult,
    GetResult,
)
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
											
												feat: Add abstract base class for vector database integration
- Created `VectorDBBase` as an abstract base class to standardize vector database operations.
- Added required methods for common vector database operations: `has_collection`, `delete_collection`, `insert`, `upsert`, `search`, `query`, `get`, `delete`, `reset`.
- The base class can now be extended by any vector database implementation (e.g., Qdrant, Pinecone) to ensure a consistent API across different database systems.
											
										 
											2025-04-21 13:26:08 +08:00
										 |  |  | class ElasticsearchClient(VectorDBBase): | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     Important: | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |     in order to reduce the number of indexes and since the embedding vector length is fixed, we avoid creating | 
					
						
							|  |  |  |     an index for each file but store it as a text field, while seperating to different index | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |     baesd on the embedding length. | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |     def __init__(self): | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  |         self.index_prefix = ELASTICSEARCH_INDEX_PREFIX | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |         self.client = Elasticsearch( | 
					
						
							|  |  |  |             hosts=[ELASTICSEARCH_URL], | 
					
						
							|  |  |  |             ca_certs=ELASTICSEARCH_CA_CERTS, | 
					
						
							|  |  |  |             api_key=ELASTICSEARCH_API_KEY, | 
					
						
							|  |  |  |             cloud_id=ELASTICSEARCH_CLOUD_ID, | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |             basic_auth=( | 
					
						
							|  |  |  |                 (ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD) | 
					
						
							|  |  |  |                 if ELASTICSEARCH_USERNAME and ELASTICSEARCH_PASSWORD | 
					
						
							|  |  |  |                 else None | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |             ssl_assert_fingerprint=SSL_ASSERT_FINGERPRINT, | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Status: works | 
					
						
							|  |  |  |     def _get_index_name(self, dimension: int) -> str: | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |         return f"{self.index_prefix}_d{str(dimension)}" | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Status: works | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |     def _scan_result_to_get_result(self, result) -> GetResult: | 
					
						
							|  |  |  |         if not result: | 
					
						
							|  |  |  |             return None | 
					
						
							|  |  |  |         ids = [] | 
					
						
							|  |  |  |         documents = [] | 
					
						
							|  |  |  |         metadatas = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         for hit in result: | 
					
						
							|  |  |  |             ids.append(hit["_id"]) | 
					
						
							|  |  |  |             documents.append(hit["_source"].get("text")) | 
					
						
							|  |  |  |             metadatas.append(hit["_source"].get("metadata")) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return GetResult(ids=[ids], documents=[documents], metadatas=[metadatas]) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |     # Status: works | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |     def _result_to_get_result(self, result) -> GetResult: | 
					
						
							|  |  |  |         if not result["hits"]["hits"]: | 
					
						
							|  |  |  |             return None | 
					
						
							|  |  |  |         ids = [] | 
					
						
							|  |  |  |         documents = [] | 
					
						
							|  |  |  |         metadatas = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         for hit in result["hits"]["hits"]: | 
					
						
							|  |  |  |             ids.append(hit["_id"]) | 
					
						
							|  |  |  |             documents.append(hit["_source"].get("text")) | 
					
						
							|  |  |  |             metadatas.append(hit["_source"].get("metadata")) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return GetResult(ids=[ids], documents=[documents], metadatas=[metadatas]) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |     # Status: works | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |     def _result_to_search_result(self, result) -> SearchResult: | 
					
						
							|  |  |  |         ids = [] | 
					
						
							|  |  |  |         distances = [] | 
					
						
							|  |  |  |         documents = [] | 
					
						
							|  |  |  |         metadatas = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         for hit in result["hits"]["hits"]: | 
					
						
							|  |  |  |             ids.append(hit["_id"]) | 
					
						
							|  |  |  |             distances.append(hit["_score"]) | 
					
						
							|  |  |  |             documents.append(hit["_source"].get("text")) | 
					
						
							|  |  |  |             metadatas.append(hit["_source"].get("metadata")) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return SearchResult( | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |             ids=[ids], | 
					
						
							|  |  |  |             distances=[distances], | 
					
						
							|  |  |  |             documents=[documents], | 
					
						
							|  |  |  |             metadatas=[metadatas], | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Status: works | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |     def _create_index(self, dimension: int): | 
					
						
							|  |  |  |         body = { | 
					
						
							|  |  |  |             "mappings": { | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  |                 "dynamic_templates": [ | 
					
						
							|  |  |  |                     { | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |                         "strings": { | 
					
						
							|  |  |  |                             "match_mapping_type": "string", | 
					
						
							|  |  |  |                             "mapping": {"type": "keyword"}, | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  |                         } | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  |                 ], | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |                 "properties": { | 
					
						
							|  |  |  |                     "collection": {"type": "keyword"}, | 
					
						
							|  |  |  |                     "id": {"type": "keyword"}, | 
					
						
							|  |  |  |                     "vector": { | 
					
						
							|  |  |  |                         "type": "dense_vector", | 
					
						
							|  |  |  |                         "dims": dimension,  # Adjust based on your vector dimensions | 
					
						
							|  |  |  |                         "index": True, | 
					
						
							|  |  |  |                         "similarity": "cosine", | 
					
						
							|  |  |  |                     }, | 
					
						
							|  |  |  |                     "text": {"type": "text"}, | 
					
						
							|  |  |  |                     "metadata": {"type": "object"}, | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |                 }, | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         self.client.indices.create(index=self._get_index_name(dimension), body=body) | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Status: works | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def _create_batches(self, items: list[VectorItem], batch_size=100): | 
					
						
							|  |  |  |         for i in range(0, len(items), batch_size): | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |             yield items[i : min(i + batch_size, len(items))] | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |     # Status: works | 
					
						
							|  |  |  |     def has_collection(self, collection_name) -> bool: | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |         query_body = {"query": {"bool": {"filter": []}}} | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |         query_body["query"]["bool"]["filter"].append( | 
					
						
							|  |  |  |             {"term": {"collection": collection_name}} | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         try: | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |             result = self.client.count(index=f"{self.index_prefix}*", body=query_body) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             return result.body["count"] > 0 | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |         except Exception as e: | 
					
						
							|  |  |  |             return None | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  |     def delete_collection(self, collection_name: str): | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |         query = {"query": {"term": {"collection": collection_name}}} | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  |         self.client.delete_by_query(index=f"{self.index_prefix}*", body=query) | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Status: works | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |     def search( | 
					
						
							|  |  |  |         self, collection_name: str, vectors: list[list[float]], limit: int | 
					
						
							|  |  |  |     ) -> Optional[SearchResult]: | 
					
						
							|  |  |  |         query = { | 
					
						
							|  |  |  |             "size": limit, | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |             "_source": ["text", "metadata"], | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |             "query": { | 
					
						
							|  |  |  |                 "script_score": { | 
					
						
							|  |  |  |                     "query": { | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |                         "bool": {"filter": [{"term": {"collection": collection_name}}]} | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |                     }, | 
					
						
							|  |  |  |                     "script": { | 
					
						
							|  |  |  |                         "source": "cosineSimilarity(params.vector, 'vector') + 1.0", | 
					
						
							|  |  |  |                         "params": { | 
					
						
							|  |  |  |                             "vector": vectors[0] | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |                         },  # Assuming single query vector | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |                     }, | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |             }, | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         result = self.client.search( | 
					
						
							|  |  |  |             index=self._get_index_name(len(vectors[0])), body=query | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return self._result_to_search_result(result) | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Status: only tested halfwat | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |     def query( | 
					
						
							|  |  |  |         self, collection_name: str, filter: dict, limit: Optional[int] = None | 
					
						
							|  |  |  |     ) -> Optional[GetResult]: | 
					
						
							|  |  |  |         if not self.has_collection(collection_name): | 
					
						
							|  |  |  |             return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         query_body = { | 
					
						
							|  |  |  |             "query": {"bool": {"filter": []}}, | 
					
						
							|  |  |  |             "_source": ["text", "metadata"], | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         for field, value in filter.items(): | 
					
						
							|  |  |  |             query_body["query"]["bool"]["filter"].append({"term": {field: value}}) | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |         query_body["query"]["bool"]["filter"].append( | 
					
						
							|  |  |  |             {"term": {"collection": collection_name}} | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |         size = limit if limit else 10 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             result = self.client.search( | 
					
						
							|  |  |  |                 index=f"{self.index_prefix}*", | 
					
						
							|  |  |  |                 body=query_body, | 
					
						
							|  |  |  |                 size=size, | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |             return self._result_to_get_result(result) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         except Exception as e: | 
					
						
							|  |  |  |             return None | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |     # Status: works | 
					
						
							|  |  |  |     def _has_index(self, dimension: int): | 
					
						
							|  |  |  |         return self.client.indices.exists( | 
					
						
							|  |  |  |             index=self._get_index_name(dimension=dimension) | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def get_or_create_index(self, dimension: int): | 
					
						
							|  |  |  |         if not self._has_index(dimension=dimension): | 
					
						
							|  |  |  |             self._create_index(dimension=dimension) | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Status: works | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |     def get(self, collection_name: str) -> Optional[GetResult]: | 
					
						
							|  |  |  |         # Get all the items in the collection. | 
					
						
							|  |  |  |         query = { | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |             "query": {"bool": {"filter": [{"term": {"collection": collection_name}}]}}, | 
					
						
							|  |  |  |             "_source": ["text", "metadata"], | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |         results = list(scan(self.client, index=f"{self.index_prefix}*", query=query)) | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |         return self._scan_result_to_get_result(results) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |     # Status: works | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |     def insert(self, collection_name: str, items: list[VectorItem]): | 
					
						
							|  |  |  |         if not self._has_index(dimension=len(items[0]["vector"])): | 
					
						
							|  |  |  |             self._create_index(dimension=len(items[0]["vector"])) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         for batch in self._create_batches(items): | 
					
						
							|  |  |  |             actions = [ | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |                 { | 
					
						
							|  |  |  |                     "_index": self._get_index_name(dimension=len(items[0]["vector"])), | 
					
						
							|  |  |  |                     "_id": item["id"], | 
					
						
							|  |  |  |                     "_source": { | 
					
						
							|  |  |  |                         "collection": collection_name, | 
					
						
							|  |  |  |                         "vector": item["vector"], | 
					
						
							|  |  |  |                         "text": item["text"], | 
					
						
							|  |  |  |                         "metadata": item["metadata"], | 
					
						
							|  |  |  |                     }, | 
					
						
							|  |  |  |                 } | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |                 for item in batch | 
					
						
							|  |  |  |             ] | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |             bulk(self.client, actions) | 
					
						
							| 
									
										
										
										
											2025-03-04 16:32:27 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  |     # Upsert documents using the update API with doc_as_upsert=True. | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |     def upsert(self, collection_name: str, items: list[VectorItem]): | 
					
						
							|  |  |  |         if not self._has_index(dimension=len(items[0]["vector"])): | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  |             self._create_index(dimension=len(items[0]["vector"])) | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |         for batch in self._create_batches(items): | 
					
						
							|  |  |  |             actions = [ | 
					
						
							|  |  |  |                 { | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  |                     "_op_type": "update", | 
					
						
							|  |  |  |                     "_index": self._get_index_name(dimension=len(item["vector"])), | 
					
						
							| 
									
										
										
										
											2025-03-04 16:32:27 +08:00
										 |  |  |                     "_id": item["id"], | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  |                     "doc": { | 
					
						
							|  |  |  |                         "collection": collection_name, | 
					
						
							| 
									
										
										
										
											2025-03-04 16:32:27 +08:00
										 |  |  |                         "vector": item["vector"], | 
					
						
							|  |  |  |                         "text": item["text"], | 
					
						
							|  |  |  |                         "metadata": item["metadata"], | 
					
						
							|  |  |  |                     }, | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  |                     "doc_as_upsert": True, | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  |                 } | 
					
						
							|  |  |  |                 for item in batch | 
					
						
							|  |  |  |             ] | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |             bulk(self.client, actions) | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Delete specific documents from a collection by filtering on both collection and document IDs. | 
					
						
							|  |  |  |     def delete( | 
					
						
							|  |  |  |         self, | 
					
						
							|  |  |  |         collection_name: str, | 
					
						
							|  |  |  |         ids: Optional[list[str]] = None, | 
					
						
							|  |  |  |         filter: Optional[dict] = None, | 
					
						
							|  |  |  |     ): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         query = { | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |             "query": {"bool": {"filter": [{"term": {"collection": collection_name}}]}} | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |         # logic based on chromaDB | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  |         if ids: | 
					
						
							|  |  |  |             query["query"]["bool"]["filter"].append({"terms": {"_id": ids}}) | 
					
						
							|  |  |  |         elif filter: | 
					
						
							|  |  |  |             for field, value in filter.items(): | 
					
						
							| 
									
										
										
										
											2025-03-06 11:17:41 +08:00
										 |  |  |                 query["query"]["bool"]["filter"].append( | 
					
						
							|  |  |  |                     {"term": {f"metadata.{field}": value}} | 
					
						
							|  |  |  |                 ) | 
					
						
							| 
									
										
										
										
											2025-03-06 05:19:56 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         self.client.delete_by_query(index=f"{self.index_prefix}*", body=query) | 
					
						
							| 
									
										
										
										
											2025-03-04 05:39:42 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def reset(self): | 
					
						
							|  |  |  |         indices = self.client.indices.get(index=f"{self.index_prefix}*") | 
					
						
							|  |  |  |         for index in indices: | 
					
						
							|  |  |  |             self.client.indices.delete(index=index) |