mirror of https://github.com/minio/minio.git
				
				
				
			export cluster health as prometheus metrics (#17741)
This commit is contained in:
		
							parent
							
								
									c2edbfae55
								
							
						
					
					
						commit
						114fab4c70
					
				|  | @ -2062,8 +2062,12 @@ type HealthOptions struct { | |||
// HealthResult captures the outcome of a cluster health evaluation.
// Callers (health handlers, metrics) use it to decide whether this
// node can safely serve traffic or be taken down for maintenance.
type HealthResult struct {
	Healthy       bool // overall health; false when any pool/set has lost write quorum
	HealingDrives int  // number of drives currently being healed, if any
	// UnhealthyPools lists every (pool, set) pair whose available
	// drive count fell below that pool's write quorum.
	UnhealthyPools []struct {
		Maintenance   bool // whether the check ran with maintenance requested
		PoolID, SetID int
		WriteQuorum   int
	}
	WriteQuorum   int  // maximum write quorum across all pools and sets
	UsingDefaults bool // true when config was not initialized and defaults are in use on this node
}
| 
 | ||||
|  | @ -2164,24 +2168,6 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea | |||
| 		usingDefaults = true | ||||
| 	} | ||||
| 
 | ||||
| 	for poolIdx := range erasureSetUpCount { | ||||
| 		for setIdx := range erasureSetUpCount[poolIdx] { | ||||
| 			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] { | ||||
| 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo), | ||||
| 					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d", | ||||
| 						poolIdx, setIdx, poolWriteQuorums[poolIdx])) | ||||
| 				return HealthResult{ | ||||
| 					Healthy:       false, | ||||
| 					HealingDrives: len(aggHealStateResult.HealDisks), | ||||
| 					PoolID:        poolIdx, | ||||
| 					SetID:         setIdx, | ||||
| 					WriteQuorum:   poolWriteQuorums[poolIdx], | ||||
| 					UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	var maximumWriteQuorum int | ||||
| 	for _, writeQuorum := range poolWriteQuorums { | ||||
| 		if maximumWriteQuorum == 0 { | ||||
|  | @ -2192,6 +2178,35 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea | |||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	result := HealthResult{ | ||||
| 		HealingDrives: len(aggHealStateResult.HealDisks), | ||||
| 		WriteQuorum:   maximumWriteQuorum, | ||||
| 		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 | ||||
| 	} | ||||
| 
 | ||||
| 	for poolIdx := range erasureSetUpCount { | ||||
| 		for setIdx := range erasureSetUpCount[poolIdx] { | ||||
| 			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] { | ||||
| 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo), | ||||
| 					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d", | ||||
| 						poolIdx, setIdx, poolWriteQuorums[poolIdx])) | ||||
| 				result.UnhealthyPools = append(result.UnhealthyPools, struct { | ||||
| 					Maintenance                bool | ||||
| 					PoolID, SetID, WriteQuorum int | ||||
| 				}{ | ||||
| 					Maintenance: opts.Maintenance, | ||||
| 					SetID:       setIdx, | ||||
| 					PoolID:      poolIdx, | ||||
| 					WriteQuorum: poolWriteQuorums[poolIdx], | ||||
| 				}) | ||||
| 			} | ||||
| 		} | ||||
| 		if len(result.UnhealthyPools) > 0 { | ||||
| 			// We have unhealthy pools return error.
 | ||||
| 			return result | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	// when maintenance is not specified we don't have
 | ||||
| 	// to look at the healing side of the code.
 | ||||
| 	if !opts.Maintenance { | ||||
|  |  | |||
|  | @ -28,22 +28,17 @@ import ( | |||
| 
 | ||||
| const unavailable = "offline" | ||||
| 
 | ||||
| func isServerNotInitialized() bool { | ||||
| 	return newObjectLayerFn() == nil | ||||
| } | ||||
| 
 | ||||
| // ClusterCheckHandler returns if the server is ready for requests.
 | ||||
| func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) { | ||||
| 	ctx := newContext(r, w, "ClusterCheckHandler") | ||||
| 
 | ||||
| 	if isServerNotInitialized() { | ||||
| 	objLayer := newObjectLayerFn() | ||||
| 	if objLayer == nil { | ||||
| 		w.Header().Set(xhttp.MinIOServerStatus, unavailable) | ||||
| 		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone) | ||||
| 		return | ||||
| 	} | ||||
| 
 | ||||
| 	objLayer := newObjectLayerFn() | ||||
| 
 | ||||
| 	ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline()) | ||||
| 	defer cancel() | ||||
| 
 | ||||
|  | @ -52,16 +47,13 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) { | |||
| 		DeploymentType: r.Form.Get("deployment-type"), | ||||
| 	} | ||||
| 	result := objLayer.Health(ctx, opts) | ||||
| 	if result.WriteQuorum > 0 { | ||||
| 	w.Header().Set(xhttp.MinIOWriteQuorum, strconv.Itoa(result.WriteQuorum)) | ||||
| 	} | ||||
| 	w.Header().Set(xhttp.MinIOStorageClassDefaults, strconv.FormatBool(result.UsingDefaults)) | ||||
| 
 | ||||
| 	if !result.Healthy { | ||||
| 	// return how many drives are being healed if any
 | ||||
| 	if result.HealingDrives > 0 { | ||||
| 		w.Header().Set(xhttp.MinIOHealingDrives, strconv.Itoa(result.HealingDrives)) | ||||
| 	} | ||||
| 	if !result.Healthy { | ||||
| 		// As a maintenance call we are purposefully asked to be taken
 | ||||
| 		// down, this is for orchestrators to know if we can safely
 | ||||
| 		// take this server down, return appropriate error.
 | ||||
|  | @ -79,14 +71,13 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) { | |||
| func ClusterReadCheckHandler(w http.ResponseWriter, r *http.Request) { | ||||
| 	ctx := newContext(r, w, "ClusterReadCheckHandler") | ||||
| 
 | ||||
| 	if isServerNotInitialized() { | ||||
| 	objLayer := newObjectLayerFn() | ||||
| 	if objLayer == nil { | ||||
| 		w.Header().Set(xhttp.MinIOServerStatus, unavailable) | ||||
| 		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone) | ||||
| 		return | ||||
| 	} | ||||
| 
 | ||||
| 	objLayer := newObjectLayerFn() | ||||
| 
 | ||||
| 	ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline()) | ||||
| 	defer cancel() | ||||
| 
 | ||||
|  | @ -106,17 +97,17 @@ func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) { | |||
| 
 | ||||
| // LivenessCheckHandler - Checks if the process is up. Always returns success.
 | ||||
| func LivenessCheckHandler(w http.ResponseWriter, r *http.Request) { | ||||
| 	peerCall := r.Header.Get("x-minio-from-peer") != "" | ||||
| 
 | ||||
| 	if peerCall { | ||||
| 		return | ||||
| 	} | ||||
| 
 | ||||
| 	if isServerNotInitialized() { | ||||
| 	objLayer := newObjectLayerFn() | ||||
| 	if objLayer == nil { | ||||
| 		// Service not initialized yet
 | ||||
| 		w.Header().Set(xhttp.MinIOServerStatus, unavailable) | ||||
| 	} | ||||
| 
 | ||||
| 	peerCall := r.Header.Get(xhttp.MinIOPeerCall) != "" | ||||
| 	if peerCall { | ||||
| 		return | ||||
| 	} | ||||
| 
 | ||||
| 	if int(globalHTTPStats.loadRequestsInQueue()) > globalAPIConfig.getRequestsPoolCapacity() { | ||||
| 		apiErr := getAPIError(ErrBusy) | ||||
| 		switch r.Method { | ||||
|  |  | |||
|  | @ -54,6 +54,7 @@ func init() { | |||
| 		getClusterTierMetrics(), | ||||
| 		getClusterUsageMetrics(), | ||||
| 		getKMSMetrics(), | ||||
| 		getClusterHealthMetrics(), | ||||
| 	} | ||||
| 
 | ||||
| 	peerMetricsGroups = []*MetricsGroup{ | ||||
|  | @ -2642,6 +2643,63 @@ func getLocalDriveStorageMetrics() *MetricsGroup { | |||
| 	return mg | ||||
| } | ||||
| 
 | ||||
| func getClusterWriteQuorumMD() MetricDescription { | ||||
| 	return MetricDescription{ | ||||
| 		Namespace: clusterMetricNamespace, | ||||
| 		Subsystem: "write", | ||||
| 		Name:      "quorum", | ||||
| 		Help:      "Maximum write quorum across all pools and sets", | ||||
| 		Type:      gaugeMetric, | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| func getClusterHealthStatusMD() MetricDescription { | ||||
| 	return MetricDescription{ | ||||
| 		Namespace: clusterMetricNamespace, | ||||
| 		Subsystem: "health", | ||||
| 		Name:      "status", | ||||
| 		Help:      "Get current cluster health status", | ||||
| 		Type:      gaugeMetric, | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| func getClusterHealthMetrics() *MetricsGroup { | ||||
| 	mg := &MetricsGroup{ | ||||
| 		cacheInterval: 10 * time.Second, | ||||
| 	} | ||||
| 	mg.RegisterRead(func(ctx context.Context) (metrics []Metric) { | ||||
| 		objLayer := newObjectLayerFn() | ||||
| 		// Service not initialized yet
 | ||||
| 		if objLayer == nil { | ||||
| 			return | ||||
| 		} | ||||
| 
 | ||||
| 		metrics = make([]Metric, 0, 2) | ||||
| 
 | ||||
| 		opts := HealthOptions{} | ||||
| 		result := objLayer.Health(ctx, opts) | ||||
| 
 | ||||
| 		metrics = append(metrics, Metric{ | ||||
| 			Description: getClusterWriteQuorumMD(), | ||||
| 			Value:       float64(result.WriteQuorum), | ||||
| 		}) | ||||
| 
 | ||||
| 		health := 1 | ||||
| 		if !result.Healthy { | ||||
| 			health = 0 | ||||
| 		} | ||||
| 
 | ||||
| 		metrics = append(metrics, Metric{ | ||||
| 			Description: getClusterHealthStatusMD(), | ||||
| 			Value:       float64(health), | ||||
| 		}) | ||||
| 
 | ||||
| 		return | ||||
| 	}) | ||||
| 
 | ||||
| 	return mg | ||||
| } | ||||
| 
 | ||||
| func getClusterStorageMetrics() *MetricsGroup { | ||||
| 	mg := &MetricsGroup{ | ||||
| 		cacheInterval: 1 * time.Minute, | ||||
|  |  | |||
|  | @ -138,8 +138,8 @@ func isServerResolvable(endpoint Endpoint, timeout time.Duration) error { | |||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 
 | ||||
| 	req.Header.Set("x-minio-from-peer", "true") | ||||
| 	// Indicate that the liveness check for a peer call
 | ||||
| 	req.Header.Set(xhttp.MinIOPeerCall, "true") | ||||
| 
 | ||||
| 	resp, err := httpClient.Do(req) | ||||
| 	if err != nil { | ||||
|  |  | |||
|  | @ -40,6 +40,8 @@ These metrics can be obtained from any MinIO server once per collection. | |||
| | `minio_cluster_kms_uptime`                    | The time the KMS has been up and running in seconds.                                                            | | ||||
| | `minio_cluster_nodes_offline_total`           | Total number of MinIO nodes offline.                                                                            | | ||||
| | `minio_cluster_nodes_online_total`            | Total number of MinIO nodes online.                                                                             | | ||||
| `minio_cluster_write_quorum`                  | Maximum write quorum across all pools and sets.                                                                 |
| `minio_cluster_health_status`                 | Current cluster health status (`1` = healthy, `0` = unhealthy).                                                 |
| | `minio_heal_objects_errors_total`             | Objects for which healing failed in current self healing run.                                                   | | ||||
| | `minio_heal_objects_heal_total`               | Objects healed in current self healing run.                                                                     | | ||||
| | `minio_heal_objects_total`                    | Objects scanned in current self healing run.                                                                    | | ||||
|  |  | |||
|  | @ -152,6 +152,9 @@ const ( | |||
| 	// Deployment id.
 | ||||
| 	MinioDeploymentID = "x-minio-deployment-id" | ||||
| 
 | ||||
| 	// Peer call
 | ||||
| 	MinIOPeerCall = "x-minio-from-peer" | ||||
| 
 | ||||
| 	// Server-Status
 | ||||
| 	MinIOServerStatus = "x-minio-server-status" | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue